# load the packages we'll need
if (!require(sqldf, quietly=TRUE, warn.conflicts=FALSE)) { install.packages("sqldf"); require(sqldf) }
if (!require(readr, quietly=TRUE, warn.conflicts=FALSE)) { install.packages("readr"); require(readr) }
# Read in the data for the books database - see the
= read_csv("data/booksDatabase/titles.csv", na="NULL", show_col_types=FALSE)
titles = read_csv("data/booksDatabase/authors.csv", na="NULL", show_col_types=FALSE)
authors = read_csv("data/booksDatabase/publishers.csv", na="NULL", show_col_types=FALSE)
publishers = read_csv("data/booksDatabase/title_authors.csv", na="NULL", show_col_types=FALSE)
title_authors = read_csv("data/booksDatabase/royalties.csv", na="NULL", show_col_types=FALSE) royalties
33 33. more joins (cross/left/right/full), subqueries, set operations, case, self joins
33.1 The “books database”
In this and other sections we will be referring to the “books database”.
Click below to download a document that describes the books database and database technology in general. This document also contains many questions/answers to test your knowledge of SQL. You can also download the CSV files that contain the data for the books database.
33.2 click to download the specified files
33.3 read in the data
33.4 CROSS JOIN
##############################################################################
# A "cross join" is very different from the joins that we've already learned about.
# A "cross join" joins EVERY row from 1st table with EVERY row from 2nd table.
# For this reason, a cross join does not specify an on condition.
##############################################################################
# Example - suppose a kindergarten teacher is using a relational database
# to manage her classroom :) Suppose she has several toys and several children.
# One table, students, describes the children,
# Another table, toys, describes the toys.
<- data.frame(
students first_name = c("abie", "bobbie", "clara"),
last_name = c("aames", "berger", "cohen"),
gender = c("m", "m", "f")
) students
first_name last_name gender
1 abie aames m
2 bobbie berger m
3 clara cohen f
= data.frame(
toys toy_name = c("doggie", "dumbo"),
description = c("suffed dog", "elephant")
) toys
toy_name description
1 doggie suffed dog
2 dumbo elephant
sqldf("select * from students order by last_name, first_name")
first_name last_name gender
1 abie aames m
2 bobbie berger m
3 clara cohen f
sqldf("select * from toys order by toy_name")
toy_name description
1 doggie suffed dog
2 dumbo elephant
# The teacher wants to make sure that each child has a chance to play
# with each of the toys.
#
# Therefore she might run the following "cross join" to see a list
# of each child and each toy. She can use this result as a checklist
# to make sure that each child actually did play with each toy.
sqldf("select students.*, toys.*
from students CROSS JOIN toys
order by last_name, first_name, toy_name")
first_name last_name gender toy_name description
1 abie aames m doggie suffed dog
2 abie aames m dumbo elephant
3 bobbie berger m doggie suffed dog
4 bobbie berger m dumbo elephant
5 clara cohen f doggie suffed dog
6 clara cohen f dumbo elephant
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# About the asterisks ( * ) in the SELECT clause ...
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The asterisks in the query above (i.e. select students.*, toys.* ...)
# means that the results should include all of the columns from the
# students table (i.e. students.*) and all of the columns from the
# toys table (i.e. toys.*).
#
# This can be done more concisely with just "select * ..." the query below
# does the exact same thing as the query above since "select * ..." means
# to include all of the columns from all of the tables in the output.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sqldf("select *
from students CROSS JOIN toys
order by last_name, first_name, toy_name")
first_name last_name gender toy_name description
1 abie aames m doggie suffed dog
2 abie aames m dumbo elephant
3 bobbie berger m doggie suffed dog
4 bobbie berger m dumbo elephant
5 clara cohen f doggie suffed dog
6 clara cohen f dumbo elephant
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The reason for using tablename.* is apparent when you want to
# include all of the columns from some of the tables but not from the other
# tables. For example the following query displays only some of the columns
# from the students table (students.gender is NOT displayed). It does display
# ALL of the columns from the toys table (ie. toys.*)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sqldf("select students.first_name, students.last_name, toys.*
from students CROSS JOIN toys
order by last_name, first_name, toy_name")
first_name last_name toy_name description
1 abie aames doggie suffed dog
2 abie aames dumbo elephant
3 bobbie berger doggie suffed dog
4 bobbie berger dumbo elephant
5 clara cohen doggie suffed dog
6 clara cohen dumbo elephant
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# select table.column ... vs select column ...
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# You must include table names in the SELECT clause for columns that
# appear in more than one table. Column names that that only appear
# in a single table in the query do NOT need to be preceded by the table name.
#
# Therefore the following
# query is the same as the above query. This is because first_name and
# last_name only appear as columns in the students table and do not appear
# in the toys table.
#
# However, it is PERMISSIBLE to include the table name for ANY column (even
# if the column only appears in a single table).
# Some people prefer to include the table name even when it
# is not necessary (as done above) to make it clearer to someone reading the
# code which table contains which columns.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Same results as above (read comment above)
sqldf("select first_name, last_name, toys.*
from students CROSS JOIN toys
order by last_name, first_name, toy_name")
first_name last_name toy_name description
1 abie aames doggie suffed dog
2 abie aames dumbo elephant
3 bobbie berger doggie suffed dog
4 bobbie berger dumbo elephant
5 clara cohen doggie suffed dog
6 clara cohen dumbo elephant
#-----------------------------------------------------------
# # rows = #rows from table1 X #rows from table2
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The total number of rows in the output of a "cross join" is
# the product of the number of rows in the first table times
# the number of rows in the 2nd table. In the example above, since
# students contains 3 rows and toys contains 2 rows, the cross
# join contains 6 rows.
#-----------------------------------------------------------
#-----------------------------------------------------------
# You can use any other clauses in a cross join
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# You an add any of the other clauses (where, group by, having, limit)
# to a "cross join". For example, if the teacher only needs the
# results of the "cross join" for the boys she can add a where
# clause as follows:
#-----------------------------------------------------------
sqldf("select students.*, toys.*
from students CROSS JOIN toys
where gender = 'm'
order by last_name, first_name, toy_name
")
first_name last_name gender toy_name description
1 abie aames m doggie suffed dog
2 abie aames m dumbo elephant
3 bobbie berger m doggie suffed dog
4 bobbie berger m dumbo elephant
#--------------------------------------------------------------------------
# "Cartesian product" is another name for a "cross join"
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The concept of a "cross join" also exists outside the scope of databases.
# Specifically in mathematical "set theory" the concept of a "cross join"
# is known as a "cartesian product". Sometimes, people will refer to
# the term "cartesian product" in discussions about databases. However in
# a select statement the words you can type are only "cross join" (NOT "cartesian product").
#--------------------------------------------------------------------------
#--------------------------------------------------------------------------
# All other types of joins depend on cross joins
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# On some occasions the need for a cross join arises, but
# cross joins are not used nearly as often as other types of joins.
# Nevertheless, it is important to understand the concept of a cross join since
# every other type of join ("innner join", "left join", "right join", "full join")
# depends on a "cross join".
#
# For example when calculating the result of an
# "inner join" the database management system (DBMS) starts by first performing a cross join.
# The DBMS then removes rows from the result of the cross join based on the
# conditions specified in the on clause and the where clause of the "inner join".
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
# A more typical example
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Many people find it hard to understand why you would want to use a cross join.
# This confusion stems from the fact that rows from the two tables are combined
# in the result even if there is seemingly no logical reason to combine them.
#
# It is easiest to see the results of a cross join when you have very few rows.
# Therefore we will create two very small tables, favTitles and favPublishers
#---------------------------------------------------------------------------
# My favorite titles
= sqldf("select title_name, type, pub_id, pages
favTitles from titles
where title_id in ('T04', 'T05', 'T10')
order by title_name")
favTitles
title_name type pub_id pages
1 But I Did It Unconciously psychology P01 510
2 Exchange of Platitudes psychology P01 201
3 Not Without My Fabrerge Egg biography P05 NA
# Publishers of those favorite titles
= sqldf("select pub_id, pub_name, city
favPublishers from publishers
where pub_id in ('P01', 'P05')
order by pub_id")
favPublishers
pub_id pub_name city
1 P01 Abatis Publishers New York
2 P05 AAA Publishing Berkeley
# The following is a cross join. It returns every record from the
# first table (favTitles) paired up with every record from the 2nd table
# (favPublishers).
#
# Notice that the pub_id's on each row of the cross join results do NOT
# necessarily match. Therefore, it may be confusing as to why you should
# use a cross join ... (keep reading)
sqldf("select favTitles.*, favPublishers.*
from favTitles CROSS JOIN favPublishers
")
title_name type pub_id pages pub_id pub_name city
1 But I Did It Unconciously psychology P01 510 P01 Abatis Publishers New York
2 But I Did It Unconciously psychology P01 510 P05 AAA Publishing Berkeley
3 Exchange of Platitudes psychology P01 201 P01 Abatis Publishers New York
4 Exchange of Platitudes psychology P01 201 P05 AAA Publishing Berkeley
5 Not Without My Fabrerge Egg biography P05 NA P01 Abatis Publishers New York
6 Not Without My Fabrerge Egg biography P05 NA P05 AAA Publishing Berkeley
33.5 Every INNER JOIN starts with a CROSS JOIN
# The following adds a where clause to the code. The where clause
# removes those rows from
# the cross join for which the favTitles.pub_id and the favPublishers.pub_id
# don't match. This is really what you want in this case. In order
# to get the correct results, first a CROSS JOIN was done that created
# all possible combinations of a row from favTitles with a row from favPublishers.
# Then the where clause eliminated the rows in the output that didn't make sense.
# This leaves us with rows that show the details for a title and the details
# of the publisher who published that title.
sqldf("select favTitles.*, favPublishers.*
from favTitles CROSS JOIN favPublishers
where favTitles.pub_id = favPublishers.pub_id
")
title_name type pub_id pages pub_id pub_name city
1 But I Did It Unconciously psychology P01 510 P01 Abatis Publishers New York
2 Exchange of Platitudes psychology P01 201 P01 Abatis Publishers New York
3 Not Without My Fabrerge Egg biography P05 NA P05 AAA Publishing Berkeley
# Another way of writing the same query is with an "inner join" (which
# can also be written as just "join"). This version of the query
# uses the word "join" instead of "cross join" and moves the condition
# that ties the tables together from the where clause into the "on" clause.
# This does EXACTLY THE SAME THING as the above query (with CROSS JOIN).
# However, the "join" syntax with its "on" clause makes the details of
# how the two tables are related to each other more obvious.
sqldf("select favTitles.*, favPublishers.*
from favTitles JOIN favPublishers ON favTitles.pub_id = favPublishers.pub_id
")
title_name type pub_id pages pub_id pub_name city
1 But I Did It Unconciously psychology P01 510 P01 Abatis Publishers New York
2 Exchange of Platitudes psychology P01 201 P01 Abatis Publishers New York
3 Not Without My Fabrerge Egg biography P05 NA P05 AAA Publishing Berkeley
33.6 The word INNER in “INNER JOIN” is optional
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# JOIN vs INNER JOIN
#
# The technical term for this type of join is an "INNER JOIN".
# However, you can write either "INNER JOIN" or just "JOIN".
# The following does the same thing as the previous query.
# The only difference in the code is that we added the
# word "INNER".
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sqldf("select favTitles.*, favPublishers.*
from favTitles INNER JOIN favPublishers ON favTitles.pub_id = favPublishers.pub_id
")
title_name type pub_id pages pub_id pub_name city
1 But I Did It Unconciously psychology P01 510 P01 Abatis Publishers New York
2 Exchange of Platitudes psychology P01 201 P01 Abatis Publishers New York
3 Not Without My Fabrerge Egg biography P05 NA P05 AAA Publishing Berkeley
33.7 Example
# QUESTION
#
# For each publisher, list the name of the publisher and total
# number of pages that the publisher has published in all their books (ie. add
# up all the pages in all books for each publisher). Show the publishers who
# have published the most pages at the top of the result list.
# ANSWER
#
# Step 1 - since publishers contains the pub_name and titles contains the
# number of pages (num_pages) for each book, we need to join those tables.
sqldf("select *
from publishers join titles on publishers.pub_id = titles.pub_id
")
pub_id pub_name city state country title_id title_name type pub_id pages price sales pubdate
1 P01 Abatis Publishers New York NY USA T01 1977! history P01 107 21.99 566 8/1/2000 0:00
2 P01 Abatis Publishers New York NY USA T04 But I Did It Unconciously psychology P01 510 12.99 13001 5/31/1999 0:00
3 P01 Abatis Publishers New York NY USA T05 Exchange of Platitudes psychology P01 201 6.95 201440 1/1/2001 0:00
4 P01 Abatis Publishers New York NY USA T06 How About Never? biography P01 473 19.95 11320 7/31/2000 0:00
5 P01 Abatis Publishers New York NY USA T08 Just Wait Until After School children P01 86 10.00 4095 6/1/2001 0:00
6 P01 Abatis Publishers New York NY USA T09 Kiss My Boo Boo children P01 22 13.95 5000 5/31/2002 0:00
7 P02 Core Dump Books San Francisco CA USA T03 Ask Yor System Administrator computer P02 1226 39.95 25667 9/1/2000 0:00
8 P03 Schandenfreude Press Hamburg <NA> Germany T02 200 Years of German Humor history P03 14 19.95 9566 4/1/1998 0:00
9 P03 Schandenfreude Press Hamburg <NA> Germany T07 I Blame My Mother biography P03 333 23.95 1500200 10/1/1999 0:00
10 P03 Schandenfreude Press Hamburg <NA> Germany T13 What Are The Civilian Applications? history P03 802 29.99 10467 5/31/1999 0:00
11 P05 AAA Publishing Berkeley CA USA T10 Not Without My Fabrerge Egg biography P05 NA NA NA <NA>
12 P05 AAA Publishing Berkeley CA USA T11 Perhaps It's a Glandular Problem psychology P05 826 7.99 94123 11/30/2000 0:00
13 P05 AAA Publishing Berkeley CA USA T12 Spontaneous, Not Annoying biography P05 507 12.99 100001 8/31/2000 0:00
# Step 2 - use group by and sum to add up the pages for each publisher
sqldf("select pub_name, sum(pages)
from publishers join titles on publishers.pub_id = titles.pub_id
group by publishers.pub_id, pub_name
")
pub_name sum(pages)
1 Abatis Publishers 1399
2 Core Dump Books 1226
3 Schandenfreude Press 1149
4 AAA Publishing 1333
# Step 3 - use order by
sqldf("select pub_name, sum(pages)
from publishers join titles on publishers.pub_id = titles.pub_id
group by publishers.pub_id, pub_name
order by sum(pages) desc
")
pub_name sum(pages)
1 Abatis Publishers 1399
2 AAA Publishing 1333
3 Core Dump Books 1226
4 Schandenfreude Press 1149
# REMEMBER - every inner join starts out under the covers as a cross join
# The following two commands are equivalent
sqldf("select au_fname, au_lname
from authors INNER join title_authors on authors.au_id = title_authors.au_id
where title_id = 'T11'
order by au_lname, au_fname")
au_fname au_lname
1 Hallie Hull
2 Klee Hull
3 Harvey Kellsey
sqldf("select au_fname, au_lname
from authors CROSS join title_authors
where authors.au_id = title_authors.au_id
and
title_id = 'T11'
order by au_lname, au_fname")
au_fname au_lname
1 Hallie Hull
2 Klee Hull
3 Harvey Kellsey
33.8 Even more types of JOINs
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# SEVERAL DIFFERENT TYPES OF "JOINS"
#
# NOTE: There are actually several different types of "joins". They all do
# slightly different things. For example:
#
# CROSS JOIN - see the top of this file
# INNER JOIN (or just JOIN) - we just described this above
# LEFT OUTER JOIN (or just LEFT JOIN) - we'll cover this later
# RIGHT OUTER JOIN (or just RIGHT JOIN) - we'll cover this later
# FULL OUTER JOIN (or just FULL JOIN) - we'll cover this later
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# INNER JOIN ( or just "JOIN" )
#
# The following is an "inner join" (or just "join").
# (We already covered this above. It's repeated here for completeness).
sqldf("select title_name, pub_name
from favTitles JOIN favPublishers ON favTitles.pub_id = favPublishers.pub_id")
title_name pub_name
1 But I Did It Unconciously Abatis Publishers
2 Exchange of Platitudes Abatis Publishers
3 Not Without My Fabrerge Egg AAA Publishing
33.9 “old” ANSI SQL way of doing inner joins
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ANSI SQL - American National Standards Institute
#
# There are different versions of the ANSI SQL standard. The versions are
# named by the year that the version came out. See this for a full history of SQL:
# https://learnsql.com/blog/history-of-sql-standards/
#
# Joins were done differently before SQL 92
# They CAN still be done the "old way" (and often are). Some people prefer
# the "old way" since there is less to type.
#
# The "old way " for a cross join is to not specify cross join
# at all. Just list the tables with commas between them.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sqldf("
select favTitles.*, favPublishers.*
from favTitles, favPublishers
")
title_name type pub_id pages pub_id pub_name city
1 But I Did It Unconciously psychology P01 510 P01 Abatis Publishers New York
2 But I Did It Unconciously psychology P01 510 P05 AAA Publishing Berkeley
3 Exchange of Platitudes psychology P01 201 P01 Abatis Publishers New York
4 Exchange of Platitudes psychology P01 201 P05 AAA Publishing Berkeley
5 Not Without My Fabrerge Egg biography P05 NA P01 Abatis Publishers New York
6 Not Without My Fabrerge Egg biography P05 NA P05 AAA Publishing Berkeley
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The "old way" of doing an "inner join" (i.e. a "regular - join")
# is to do a "cross join" the old way and then add a where clause
# that does the job of the "on".
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sqldf("
select favTitles.*, favPublishers.*
from favTitles, favPublishers
where favTitles.pub_id = favPublishers.pub_id
")
title_name type pub_id pages pub_id pub_name city
1 But I Did It Unconciously psychology P01 510 P01 Abatis Publishers New York
2 Exchange of Platitudes psychology P01 201 P01 Abatis Publishers New York
3 Not Without My Fabrerge Egg biography P05 NA P05 AAA Publishing Berkeley
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# in the "old way" of doing inner joins, you could add additional
# conditions to the where clause just like in the "new way" of
# writing queries.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sqldf("
select favTitles.*, favPublishers.*
from favTitles, favPublishers
where favTitles.pub_id = favPublishers.pub_id and
pages >= 300
")
title_name type pub_id pages pub_id pub_name city
1 But I Did It Unconciously psychology P01 510 P01 Abatis Publishers New York
33.10 LEFT JOIN
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# LEFT OUTER JOIN (or just "LEFT JOIN")
#
# Before learning about "LEFT JOIN", you should first understand one
# issue that comes up when using "INNER JOIN"s.
#
# This issue is best shown by way of an example.
# Publisher, P04 - Tenterhooks Press, does not have any corresponding rows in
# the titles table (perhaps it's a new publishing company that is just getting
# started). For this reason when you use an "inner join" to join the titles
# table and the publishers table you will NOT see any rows for Tenterhooks press.
# The following "inner join" (or just "join") shows the output below.
# Tenterhooks Press does not appear in the output at all.
#
# > sqldf("select pub_name, title_name
# from publishers JOIN titles on publishers.pub_id = titles.pub_id
# order by pub_name, title_name")
#
# pub_name title_name
# 1 AAA Publishing Not Without My Fabrerge Egg
# 2 AAA Publishing Perhaps It's a Glandular Problem
# 3 AAA Publishing Spontaneous, Not Annoying
# 4 Abatis Publishers 1977!
# 5 Abatis Publishers But I Did It Unconciously
# 6 Abatis Publishers Exchange of Platitudes
# 7 Abatis Publishers How About Never?
# 8 Abatis Publishers Just Wait Until After School
# 9 Abatis Publishers Kiss My Boo Boo
# 10 Core Dump Books Ask Yor System Administrator
# 11 Schandenfreude Press 200 Years of German Humor
# 12 Schandenfreude Press I Blame My Mother
# 13 Schandenfreude Press What Are The Civilian Applications?
#
#
# However, suppose you want to see the same output, but you'd also like
# to see Tenterhooks Press, even though they haven't actually published
# any titles yet? .... that is where a LEFT JOIN is useful.
#
# The output of a "left join" contains all of the rows of an "inner join"
# plus possibly a few more rows that would not appear in the output of
# an inner join.
# By simply changing the "inner join" (i.e. "join") in the previous
# query to "left join" (or "left outer join") we get the following results.
# Notice that Tenterhooks Press now DOES appear in a single row.
# All columns that would have come from the corresponding rows in the titles
# table are NULL (i.e. NA in R dataframes) for the Tenterhooks Press row.
# > sqldf("select pub_name, title_name
# from publishers LEFT JOIN titles on publishers.pub_id = titles.pub_id
# order by pub_name, title_name")
#
# pub_name title_name
# 1 AAA Publishing Not Without My Fabrerge Egg
# 2 AAA Publishing Perhaps It's a Glandular Problem
# 3 AAA Publishing Spontaneous, Not Annoying
# 4 Abatis Publishers 1977!
# 5 Abatis Publishers But I Did It Unconciously
# 6 Abatis Publishers Exchange of Platitudes
# 7 Abatis Publishers How About Never?
# 8 Abatis Publishers Just Wait Until After School
# 9 Abatis Publishers Kiss My Boo Boo
# 10 Core Dump Books Ask Yor System Administrator
# 11 Schandenfreude Press 200 Years of German Humor
# 12 Schandenfreude Press I Blame My Mother
# 13 Schandenfreude Press What Are The Civilian Applications?
# 14 Tneterhooks Press <NA>
#
#
# WHAT DOES "LEFT" MEAN ???
#
# The word "left" and "right" in "left join" and "right join" refer to the
# location of the table names in the SQL code. Specifically, the text of
# the code of every JOIN has a table name to the LEFT of the word "JOIN" and
# a table name to the right of the word "JOIN". When you specify
#
# "... tableA LEFT JOIN tableB ON ..."
#
# tableA is the "left" table and
# tableB is the "right" table
#
# The LEFT JOIN syntax says that if there are rows in the left table (tableA
# in the example above) that don't have corresponding rows in the "right" table
# (tableB in the example above) then those rows from tableA should still
# appear in the output. Any columns that would have come from tableB will
# have NULL values (i.e. NA in R dataframes) in that row.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The following is a "regular join" (i.e. an "inner join")
#
# Tenterhooks press does not show up since they have no titles
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sqldf("select titles.title_name, pub_name, publishers.pub_id
from titles join publishers on titles.pub_id = publishers.pub_id
order by pub_name, title_id")
title_name pub_name pub_id
1 Not Without My Fabrerge Egg AAA Publishing P05
2 Perhaps It's a Glandular Problem AAA Publishing P05
3 Spontaneous, Not Annoying AAA Publishing P05
4 1977! Abatis Publishers P01
5 But I Did It Unconciously Abatis Publishers P01
6 Exchange of Platitudes Abatis Publishers P01
7 How About Never? Abatis Publishers P01
8 Just Wait Until After School Abatis Publishers P01
9 Kiss My Boo Boo Abatis Publishers P01
10 Ask Yor System Administrator Core Dump Books P02
11 200 Years of German Humor Schandenfreude Press P03
12 I Blame My Mother Schandenfreude Press P03
13 What Are The Civilian Applications? Schandenfreude Press P03
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The following is a "left join"
#
# Tenterhooks press DOES show up since a left join will add to
# the results of an inner join, any rows that appear in the "left"
# hand table that have no corresponding rows in the right hand table.
# In the result set, the columns from the right hand table in these
# rows are NULL (or "NA" when using sqldf in R)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sqldf("select titles.title_name, pub_name, publishers.pub_id
from publishers left join titles on titles.pub_id = publishers.pub_id
order by pub_name, title_id")
title_name pub_name pub_id
1 Not Without My Fabrerge Egg AAA Publishing P05
2 Perhaps It's a Glandular Problem AAA Publishing P05
3 Spontaneous, Not Annoying AAA Publishing P05
4 1977! Abatis Publishers P01
5 But I Did It Unconciously Abatis Publishers P01
6 Exchange of Platitudes Abatis Publishers P01
7 How About Never? Abatis Publishers P01
8 Just Wait Until After School Abatis Publishers P01
9 Kiss My Boo Boo Abatis Publishers P01
10 Ask Yor System Administrator Core Dump Books P02
11 200 Years of German Humor Schandenfreude Press P03
12 I Blame My Mother Schandenfreude Press P03
13 What Are The Civilian Applications? Schandenfreude Press P03
14 <NA> Tneterhooks Press P04
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ALL of the columns from the right hand table contain NULLs (i.e. NA)
# for the extra rows that are added by the left join
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# To prove that point, the following shows all of the columns from both
# the left hand table (publishers) and the right hand table (titles)
# Notice that ALL of the columns for the right hand table will have
# NULLs (i.e. NA in sqldf in R) for those rows of the output that
# contain data from the left hand table for which there is no corresponding
# row in the right hand table
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sqldf("select publishers.*, titles.*
from publishers left join titles on titles.pub_id = publishers.pub_id
order by pub_name, title_id")
pub_id pub_name city state country title_id title_name type pub_id pages price sales pubdate
1 P05 AAA Publishing Berkeley CA USA T10 Not Without My Fabrerge Egg biography P05 NA NA NA <NA>
2 P05 AAA Publishing Berkeley CA USA T11 Perhaps It's a Glandular Problem psychology P05 826 7.99 94123 11/30/2000 0:00
3 P05 AAA Publishing Berkeley CA USA T12 Spontaneous, Not Annoying biography P05 507 12.99 100001 8/31/2000 0:00
4 P01 Abatis Publishers New York NY USA T01 1977! history P01 107 21.99 566 8/1/2000 0:00
5 P01 Abatis Publishers New York NY USA T04 But I Did It Unconciously psychology P01 510 12.99 13001 5/31/1999 0:00
6 P01 Abatis Publishers New York NY USA T05 Exchange of Platitudes psychology P01 201 6.95 201440 1/1/2001 0:00
7 P01 Abatis Publishers New York NY USA T06 How About Never? biography P01 473 19.95 11320 7/31/2000 0:00
8 P01 Abatis Publishers New York NY USA T08 Just Wait Until After School children P01 86 10.00 4095 6/1/2001 0:00
9 P01 Abatis Publishers New York NY USA T09 Kiss My Boo Boo children P01 22 13.95 5000 5/31/2002 0:00
10 P02 Core Dump Books San Francisco CA USA T03 Ask Yor System Administrator computer P02 1226 39.95 25667 9/1/2000 0:00
11 P03 Schandenfreude Press Hamburg <NA> Germany T02 200 Years of German Humor history P03 14 19.95 9566 4/1/1998 0:00
12 P03 Schandenfreude Press Hamburg <NA> Germany T07 I Blame My Mother biography P03 333 23.95 1500200 10/1/1999 0:00
13 P03 Schandenfreude Press Hamburg <NA> Germany T13 What Are The Civilian Applications? history P03 802 29.99 10467 5/31/1999 0:00
14 P04 Tneterhooks Press Berkeley CA USA <NA> <NA> <NA> <NA> NA NA NA <NA>
# BE CAREFUL ...
#
# When doing an "inner join" (i.e. a "join") it doesn't make a difference
# which table you type to the left of the word "join" and which to the right.
#
# HOWEVER - for "LEFT JOIN"s it DOES MATTER!!!
#
# If we modify the previous query and put the publishers table on the
# right side of the "LEFT JOIN" keyword, we will again NOT see Teneterhooks Press
# since a LEFT JOIN only adds extra rows to the output for the rows in the
# LEFT hand table that don't have corresponding rows in the right hand table.
# Tenterhooks Press does NOT appear since the publishers table is on the
# right hand side of the "LEFT JOIN" keywords
sqldf("select titles.title_name, pub_name, publishers.pub_id
from titles left join publishers on titles.pub_id = publishers.pub_id
order by pub_name, title_id")
title_name pub_name pub_id
1 Not Without My Fabrerge Egg AAA Publishing P05
2 Perhaps It's a Glandular Problem AAA Publishing P05
3 Spontaneous, Not Annoying AAA Publishing P05
4 1977! Abatis Publishers P01
5 But I Did It Unconciously Abatis Publishers P01
6 Exchange of Platitudes Abatis Publishers P01
7 How About Never? Abatis Publishers P01
8 Just Wait Until After School Abatis Publishers P01
9 Kiss My Boo Boo Abatis Publishers P01
10 Ask Yor System Administrator Core Dump Books P02
11 200 Years of German Humor Schandenfreude Press P03
12 I Blame My Mother Schandenfreude Press P03
13 What Are The Civilian Applications? Schandenfreude Press P03
# Lets add a row to the titles table that doesn't have a correponding row
# in the publishers table.
= data.frame(title_id='014', title_name='Data Everywhere', type='computer', pub_id='P99', pages='200', price='19.99', sales=0, pubdate='1/01/2024')
newBook = rbind(titles, newBook)
titles2 titles2
# A tibble: 14 × 8
title_id title_name type pub_id pages price sales pubdate
<chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
1 T01 1977! history P01 107 21.99 566 8/1/2000 0:00
2 T02 200 Years of German Humor history P03 14 19.95 9566 4/1/1998 0:00
3 T03 Ask Yor System Administrator computer P02 1226 39.95 25667 9/1/2000 0:00
4 T04 But I Did It Unconciously psychology P01 510 12.99 13001 5/31/1999 0:00
5 T05 Exchange of Platitudes psychology P01 201 6.95 201440 1/1/2001 0:00
6 T06 How About Never? biography P01 473 19.95 11320 7/31/2000 0:00
7 T07 I Blame My Mother biography P03 333 23.95 1500200 10/1/1999 0:00
8 T08 Just Wait Until After School children P01 86 10 4095 6/1/2001 0:00
9 T09 Kiss My Boo Boo children P01 22 13.95 5000 5/31/2002 0:00
10 T10 Not Without My Fabrerge Egg biography P05 <NA> <NA> NA <NA>
11 T11 Perhaps It's a Glandular Problem psychology P05 826 7.99 94123 11/30/2000 0:00
12 T12 Spontaneous, Not Annoying biography P05 507 12.99 100001 8/31/2000 0:00
13 T13 What Are The Civilian Applications? history P03 802 29.99 10467 5/31/1999 0:00
14 014 Data Everywhere computer P99 200 19.99 0 1/01/2024
sqldf("select titles2.title_name, pub_name, publishers.pub_id
from publishers left join titles2 on titles2.pub_id = publishers.pub_id
order by pub_name, title_id")
title_name pub_name pub_id
1 Not Without My Fabrerge Egg AAA Publishing P05
2 Perhaps It's a Glandular Problem AAA Publishing P05
3 Spontaneous, Not Annoying AAA Publishing P05
4 1977! Abatis Publishers P01
5 But I Did It Unconciously Abatis Publishers P01
6 Exchange of Platitudes Abatis Publishers P01
7 How About Never? Abatis Publishers P01
8 Just Wait Until After School Abatis Publishers P01
9 Kiss My Boo Boo Abatis Publishers P01
10 Ask Yor System Administrator Core Dump Books P02
11 200 Years of German Humor Schandenfreude Press P03
12 I Blame My Mother Schandenfreude Press P03
13 What Are The Civilian Applications? Schandenfreude Press P03
14 <NA> Tneterhooks Press P04
sqldf("select titles2.title_name, pub_name, publishers.pub_id
from titles2 left join publishers on titles2.pub_id = publishers.pub_id
order by pub_name, title_id")
title_name pub_name pub_id
1 Data Everywhere <NA> <NA>
2 Not Without My Fabrerge Egg AAA Publishing P05
3 Perhaps It's a Glandular Problem AAA Publishing P05
4 Spontaneous, Not Annoying AAA Publishing P05
5 1977! Abatis Publishers P01
6 But I Did It Unconciously Abatis Publishers P01
7 Exchange of Platitudes Abatis Publishers P01
8 How About Never? Abatis Publishers P01
9 Just Wait Until After School Abatis Publishers P01
10 Kiss My Boo Boo Abatis Publishers P01
11 Ask Yor System Administrator Core Dump Books P02
12 200 Years of German Humor Schandenfreude Press P03
13 I Blame My Mother Schandenfreude Press P03
14 What Are The Civilian Applications? Schandenfreude Press P03
33.11 Practice
##########################################################################
# PRACTICE QUESTION
#
# Write a query that shows the publisher name (i.e. pub_name) for those publishers
# (and only those publishers)) who have no titles in the titles table.
##############################################################################
# Remember this query
sqldf("select publishers.*, titles.*
from publishers left join titles on publishers.pub_id = titles.pub_id
")
pub_id pub_name city state country title_id title_name type pub_id pages price sales pubdate
1 P01 Abatis Publishers New York NY USA T01 1977! history P01 107 21.99 566 8/1/2000 0:00
2 P01 Abatis Publishers New York NY USA T04 But I Did It Unconciously psychology P01 510 12.99 13001 5/31/1999 0:00
3 P01 Abatis Publishers New York NY USA T05 Exchange of Platitudes psychology P01 201 6.95 201440 1/1/2001 0:00
4 P01 Abatis Publishers New York NY USA T06 How About Never? biography P01 473 19.95 11320 7/31/2000 0:00
5 P01 Abatis Publishers New York NY USA T08 Just Wait Until After School children P01 86 10.00 4095 6/1/2001 0:00
6 P01 Abatis Publishers New York NY USA T09 Kiss My Boo Boo children P01 22 13.95 5000 5/31/2002 0:00
7 P02 Core Dump Books San Francisco CA USA T03 Ask Yor System Administrator computer P02 1226 39.95 25667 9/1/2000 0:00
8 P03 Schandenfreude Press Hamburg <NA> Germany T02 200 Years of German Humor history P03 14 19.95 9566 4/1/1998 0:00
9 P03 Schandenfreude Press Hamburg <NA> Germany T07 I Blame My Mother biography P03 333 23.95 1500200 10/1/1999 0:00
10 P03 Schandenfreude Press Hamburg <NA> Germany T13 What Are The Civilian Applications? history P03 802 29.99 10467 5/31/1999 0:00
11 P04 Tneterhooks Press Berkeley CA USA <NA> <NA> <NA> <NA> NA NA NA <NA>
12 P05 AAA Publishing Berkeley CA USA T10 Not Without My Fabrerge Egg biography P05 NA NA NA <NA>
13 P05 AAA Publishing Berkeley CA USA T11 Perhaps It's a Glandular Problem psychology P05 826 7.99 94123 11/30/2000 0:00
14 P05 AAA Publishing Berkeley CA USA T12 Spontaneous, Not Annoying biography P05 507 12.99 100001 8/31/2000 0:00
# add a where clause as shown below to get ONLY those rows from the left table that have no matching rows in the right table
sqldf("select publishers.*, titles.*
from publishers left join titles on publishers.pub_id = titles.pub_id
where titles.type is null
")
pub_id pub_name city state country title_id title_name type pub_id pages price sales pubdate
1 P04 Tneterhooks Press Berkeley CA USA <NA> <NA> <NA> <NA> NA NA NA <NA>
# The following works, but it isn't ideal .... because titles.type
# is not the best column to choose to test for null (keep reading for an explanation ...)
sqldf("select pub_name
from publishers left join titles on publishers.pub_id = titles.pub_id
where titles.type is null
")
pub_name
1 Tneterhooks Press
# To explain why titles.type was not the best column to choose above compare
# the query below where we used titles.pages to test for NULL.
# Since there is a row in titles table that contains NULL for pages that row
# causes the publisher for that row (i..e AAA Publishing) to be displayed
# in the output.
sqldf("select pub_name
from publishers left join titles on publishers.pub_id = titles.pub_id
where titles.pages is null
")
pub_name
1 Tneterhooks Press
2 AAA Publishing
#----------------------------------------------------------------------------
# The best way to identify only those rows that were added due to the
# "left" join is to look for NULL in a column that would otherwise never
# contain a NULL.
#
# Best choices are
# - a primary key field (i.e. column)
# - or another column that is defined in the database to never allow NULLs
#----------------------------------------------------------------------------
# The best way to do the same query is by checking the primary key of the right
# hand table for NULL in the where clause.
sqldf("select pub_name
from publishers left join titles on publishers.pub_id = titles.pub_id
where titles.title_id is null
")
pub_name
1 Tneterhooks Press
33.12 RIGHT JOIN and FULL JOIN
###############################################################.
# RIGHT JOIN and FULL JOIN
#
# LEFT JOIN is also known as a LEFT OUTER JOIN
# RIGHT JOIN is also known as a RIGHT OUTER JOIN
# FULL JOIN is also known as a FULL OUTER JOIN
#
# These are all known as "OUTER" joins
###############################################################.
# ANSI SQL also has "right join" and "full join"
#
# A right join does the same thing as a left join except that
# "right join" shows rows in the output that correspond to rows from the
# right hand table that don't have corresponding rows in the left hand table.
# The following LEFT JOIN ...
sqldf("select publishers.*, titles.*
from publishers left join titles on titles.pub_id = publishers.pub_id
order by pub_name, title_id")
pub_id pub_name city state country title_id title_name type pub_id pages price sales pubdate
1 P05 AAA Publishing Berkeley CA USA T10 Not Without My Fabrerge Egg biography P05 NA NA NA <NA>
2 P05 AAA Publishing Berkeley CA USA T11 Perhaps It's a Glandular Problem psychology P05 826 7.99 94123 11/30/2000 0:00
3 P05 AAA Publishing Berkeley CA USA T12 Spontaneous, Not Annoying biography P05 507 12.99 100001 8/31/2000 0:00
4 P01 Abatis Publishers New York NY USA T01 1977! history P01 107 21.99 566 8/1/2000 0:00
5 P01 Abatis Publishers New York NY USA T04 But I Did It Unconciously psychology P01 510 12.99 13001 5/31/1999 0:00
6 P01 Abatis Publishers New York NY USA T05 Exchange of Platitudes psychology P01 201 6.95 201440 1/1/2001 0:00
7 P01 Abatis Publishers New York NY USA T06 How About Never? biography P01 473 19.95 11320 7/31/2000 0:00
8 P01 Abatis Publishers New York NY USA T08 Just Wait Until After School children P01 86 10.00 4095 6/1/2001 0:00
9 P01 Abatis Publishers New York NY USA T09 Kiss My Boo Boo children P01 22 13.95 5000 5/31/2002 0:00
10 P02 Core Dump Books San Francisco CA USA T03 Ask Yor System Administrator computer P02 1226 39.95 25667 9/1/2000 0:00
11 P03 Schandenfreude Press Hamburg <NA> Germany T02 200 Years of German Humor history P03 14 19.95 9566 4/1/1998 0:00
12 P03 Schandenfreude Press Hamburg <NA> Germany T07 I Blame My Mother biography P03 333 23.95 1500200 10/1/1999 0:00
13 P03 Schandenfreude Press Hamburg <NA> Germany T13 What Are The Civilian Applications? history P03 802 29.99 10467 5/31/1999 0:00
14 P04 Tneterhooks Press Berkeley CA USA <NA> <NA> <NA> <NA> NA NA NA <NA>
# ... will generate the EXACT SAME OUTPUT as the following RIGHT JOIN
sqldf("select publishers.*, titles.*
from titles right join publishers on titles.pub_id = publishers.pub_id
order by pub_name, title_id")
pub_id pub_name city state country title_id title_name type pub_id pages price sales pubdate
1 P05 AAA Publishing Berkeley CA USA T10 Not Without My Fabrerge Egg biography P05 NA NA NA <NA>
2 P05 AAA Publishing Berkeley CA USA T11 Perhaps It's a Glandular Problem psychology P05 826 7.99 94123 11/30/2000 0:00
3 P05 AAA Publishing Berkeley CA USA T12 Spontaneous, Not Annoying biography P05 507 12.99 100001 8/31/2000 0:00
4 P01 Abatis Publishers New York NY USA T01 1977! history P01 107 21.99 566 8/1/2000 0:00
5 P01 Abatis Publishers New York NY USA T04 But I Did It Unconciously psychology P01 510 12.99 13001 5/31/1999 0:00
6 P01 Abatis Publishers New York NY USA T05 Exchange of Platitudes psychology P01 201 6.95 201440 1/1/2001 0:00
7 P01 Abatis Publishers New York NY USA T06 How About Never? biography P01 473 19.95 11320 7/31/2000 0:00
8 P01 Abatis Publishers New York NY USA T08 Just Wait Until After School children P01 86 10.00 4095 6/1/2001 0:00
9 P01 Abatis Publishers New York NY USA T09 Kiss My Boo Boo children P01 22 13.95 5000 5/31/2002 0:00
10 P02 Core Dump Books San Francisco CA USA T03 Ask Yor System Administrator computer P02 1226 39.95 25667 9/1/2000 0:00
11 P03 Schandenfreude Press Hamburg <NA> Germany T02 200 Years of German Humor history P03 14 19.95 9566 4/1/1998 0:00
12 P03 Schandenfreude Press Hamburg <NA> Germany T07 I Blame My Mother biography P03 333 23.95 1500200 10/1/1999 0:00
13 P03 Schandenfreude Press Hamburg <NA> Germany T13 What Are The Civilian Applications? history P03 802 29.99 10467 5/31/1999 0:00
14 P04 Tneterhooks Press Berkeley CA USA <NA> <NA> <NA> <NA> NA NA NA <NA>
# HOWEVER - SQLite does not implement RIGHT JOIN even though it is part of the ANSI Standard for SQL
# Similarly a "full join" shows rows in the output for BOTH
# - rows in the left hand table that don't have corresponding rows in the right hand table and
# - rows in the right hand table that don't have corresponding rows in the left hand table and
#
# SQLite - the database that are using does not implement RIGHT JOINs or FULL JOINs
# However, there are simple workarounds for both.
# The workaround for a right join is simply to use a left join and type
# the table names in the opposite order.
#
# The workaround for FULL JOIN requires you to understand "UNION" ... see below
##########################################################################.
### UPDATE !!!
###
### As of now (in 2024) sqlite has implemented both RIGHT JOIN
### and FULL JOIN. I will update these notes with that info soon.
##########################################################################.
33.13 SQL set operations (i.e. UNION, EXCEPT, INTERSECT)
##################################################
# Combining the output of 2 queries
#
# UNION
# UNION ALL
# EXCEPT
# INTERSECT
##################################################
# The following is a query that you should already understand.
# There is nothing new here. We will see below how to rewrite this
# query in a different way by using "UNION"
#
# The following displays titles that have fewer than 10 pages
# and also titles that have more than 20 pages.
sqldf("select title_name, price
from titles
where price < 10 or price > 20
order by price")
title_name price
1 Exchange of Platitudes 6.95
2 Perhaps It's a Glandular Problem 7.99
3 1977! 21.99
4 I Blame My Mother 23.95
5 What Are The Civilian Applications? 29.99
6 Ask Yor System Administrator 39.95
33.13.1 UNION
#----------------------------------------------------------------------
# UNION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# union is used to combine two different queries into one output set.
# The 2 queries must have the same number of columns and the same type
# of columns.
#
# The order by is done after both queries and combines the rows from
# the two select statements in the specified order.
#----------------------------------------------------------------------
# First ... let's see each of the queries on their own ...
# get the titles that are < 10 dollars
sqldf("select title_name, price
from titles
where price < 10
")
title_name price
1 Exchange of Platitudes 6.95
2 Perhaps It's a Glandular Problem 7.99
# get the titles that are > 20 dollars
sqldf("select title_name, price
from titles
where price > 20
")
title_name price
1 1977! 21.99
2 Ask Yor System Administrator 39.95
3 I Blame My Mother 23.95
4 What Are The Civilian Applications? 29.99
# Now, let's see how to use
# UNION to combine the two previous queries into a single output.
sqldf("select title_name, price
from titles
where price < 10
union
select title_name, price
from titles
where price > 20
order by price")
title_name price
1 Exchange of Platitudes 6.95
2 Perhaps It's a Glandular Problem 7.99
3 1977! 21.99
4 I Blame My Mother 23.95
5 What Are The Civilian Applications? 29.99
6 Ask Yor System Administrator 39.95
# Do the same but order by title_name
sqldf("select title_name, price
from titles
where price < 10
union
select title_name, price
from titles
where price > 20
order by title_name")
title_name price
1 1977! 21.99
2 Ask Yor System Administrator 39.95
3 Exchange of Platitudes 6.95
4 I Blame My Mother 23.95
5 Perhaps It's a Glandular Problem 7.99
6 What Are The Civilian Applications? 29.99
# Another example
# Show all book titles for which the price is > 20 or the pages is > 500
sqldf("select title_name, price, pages
from titles
where price > 20 or pages > 500
order by price")
title_name price pages
1 Perhaps It's a Glandular Problem 7.99 826
2 But I Did It Unconciously 12.99 510
3 Spontaneous, Not Annoying 12.99 507
4 1977! 21.99 107
5 I Blame My Mother 23.95 333
6 What Are The Civilian Applications? 29.99 802
7 Ask Yor System Administrator 39.95 1226
33.13.2 UNION ALL
#----------------------------------------------------------
# UNION removes duplicate rows from the output
#
# UNION ALL does NOT remove duplicate rows
#----------------------------------------------------------
# Union removes duplicate rows from the output
#
# Notice that some books match both conditions but only come up in the
# results once.
sqldf("select title_name, price, pages
from titles
where price > 20
union
select title_name, price, pages
from titles
where pages > 500
order by price")
title_name price pages
1 Perhaps It's a Glandular Problem 7.99 826
2 But I Did It Unconciously 12.99 510
3 Spontaneous, Not Annoying 12.99 507
4 1977! 21.99 107
5 I Blame My Mother 23.95 333
6 What Are The Civilian Applications? 29.99 802
7 Ask Yor System Administrator 39.95 1226
# UNION ALL
#
# Running the same query as above but typing "UNION ALL"
# Will show the rows that meet both conditions, twice.
sqldf("select title_name, price, pages
from titles
where price > 20
union all
select title_name, price, pages
from titles
where pages > 500
order by price")
title_name price pages
1 Perhaps It's a Glandular Problem 7.99 826
2 But I Did It Unconciously 12.99 510
3 Spontaneous, Not Annoying 12.99 507
4 1977! 21.99 107
5 I Blame My Mother 23.95 333
6 What Are The Civilian Applications? 29.99 802
7 What Are The Civilian Applications? 29.99 802
8 Ask Yor System Administrator 39.95 1226
9 Ask Yor System Administrator 39.95 1226
33.13.3 Practice
# PRACTICE QUESTION
#
# Show the names of all authors and publishers and the city that they are from.
# Indicate if the row in the output is for a 'publisher' or for an 'author'.
# For authors dispaly the name of the author as FIRST LAST in a single column.
# This column should be called 'name' - this is the same column as
# the publisher's name would appear.
# Sort the results by the name.
#
# NAME CITY Pub_or_author
# Joe Smith New York Author
# AAA pubs Seattle Publisher
# Sue Cohen LA Author
# etc.
# We can create two queries, one for authors and one for publishers
sqldf("select au_fname || ' ' || au_lname as name, city, 'Author' as 'pub_or_author'
from authors")
name city pub_or_author
1 Sarah Buchman Bronx Author
2 Wendy Heydemark Boulder Author
3 Hallie Hull San Francisco Author
4 Klee Hull San Francisco Author
5 Christian Kells New York Author
6 Harvey Kellsey Palo Alto Author
7 Paddy O'Furniture Sarasota Author
sqldf("select pub_name, city, 'Publisher' as 'pub_or_author'
from publishers")
pub_name city pub_or_author
1 Abatis Publishers New York Publisher
2 Core Dump Books San Francisco Publisher
3 Schandenfreude Press Hamburg Publisher
4 Tneterhooks Press Berkeley Publisher
5 AAA Publishing Berkeley Publisher
# Then combine the two queries with union
sqldf("select au_fname || ' ' || au_lname as name, city, 'Author' as 'pub_or_author'
from authors
union
select pub_name, city, 'Publisher' as 'pub_or_author'
from publishers
order by 1")
name city pub_or_author
1 AAA Publishing Berkeley Publisher
2 Abatis Publishers New York Publisher
3 Christian Kells New York Author
4 Core Dump Books San Francisco Publisher
5 Hallie Hull San Francisco Author
6 Harvey Kellsey Palo Alto Author
7 Klee Hull San Francisco Author
8 Paddy O'Furniture Sarasota Author
9 Sarah Buchman Bronx Author
10 Schandenfreude Press Hamburg Publisher
11 Tneterhooks Press Berkeley Publisher
12 Wendy Heydemark Boulder Author
33.13.4 EXCEPT
###################################
# EXCEPT
###################################
# QUESTION
# Show the publishers who publish biography books but not history books
# ANSWER
# Let's start by first writing a query that gets those publishers who have
# published biographies (spoiler alert ... we are later going to
# remove from this list those publishers who have published history books)
sqldf("select distinct pub_name
from publishers join titles on publishers.pub_id = titles.pub_id
where titles.type = 'biography'")
pub_name
1 Abatis Publishers
2 Schandenfreude Press
3 AAA Publishing
# If you take out the distinct you will get duplicate copies of publisher names
sqldf("select pub_name
from publishers join titles on publishers.pub_id = titles.pub_id
where titles.type = 'biography'")
pub_name
1 Abatis Publishers
2 Schandenfreude Press
3 AAA Publishing
4 AAA Publishing
# You get the duplicates for publishers who published more than one biography
#
# You can see that by looking at all of the data, not just the publisher name.
sqldf("select publishers.*, titles.*
from publishers join titles on publishers.pub_id = titles.pub_id
where titles.type = 'biography'")
pub_id pub_name city state country title_id title_name type pub_id pages price sales pubdate
1 P01 Abatis Publishers New York NY USA T06 How About Never? biography P01 473 19.95 11320 7/31/2000 0:00
2 P03 Schandenfreude Press Hamburg <NA> Germany T07 I Blame My Mother biography P03 333 23.95 1500200 10/1/1999 0:00
3 P05 AAA Publishing Berkeley CA USA T10 Not Without My Fabrerge Egg biography P05 NA NA NA <NA>
4 P05 AAA Publishing Berkeley CA USA T12 Spontaneous, Not Annoying biography P05 507 12.99 100001 8/31/2000 0:00
# Once again, this is the query that shows publishers who published biographies
sqldf("select distinct pub_name
from publishers join titles on publishers.pub_id = titles.pub_id
where titles.type = 'biography'")
pub_name
1 Abatis Publishers
2 Schandenfreude Press
3 AAA Publishing
# This is the query for publishers who published history books
sqldf("select distinct pub_name
from publishers join titles on publishers.pub_id = titles.pub_id
where titles.type = 'history'")
pub_name
1 Abatis Publishers
2 Schandenfreude Press
# Now show publishers who published biograhpies BUT NOT history books
sqldf("select distinct pub_name
from publishers join titles on publishers.pub_id = titles.pub_id
where titles.type = 'biography'
EXCEPT
select distinct pub_name
from publishers join titles on publishers.pub_id = titles.pub_id
where titles.type = 'history'")
pub_name
1 AAA Publishing
33.13.5 INTERSECT
###################################
# INTERSECT
###################################
#---------------------------------------------------------------------
# Show the cities that have both publishers and authors residing there.
#---------------------------------------------------------------------
# show the cities that have publishers
sqldf("select distinct city, state
from publishers
order by city
")
city state
1 Berkeley CA
2 Hamburg <NA>
3 New York NY
4 San Francisco CA
# Show the cities that have authors
sqldf("select distinct city, state
from authors
order by city")
city state
1 Boulder CO
2 Bronx NY
3 New York NY
4 Palo Alto CA
5 San Francisco CA
6 Sarasota FL
# Combine the two queries with INTERSECT.
# This will only show rows that are output from both of the
# select statements.
sqldf(" select distinct city, state
from publishers
INTERSECT
select distinct city, state
from authors
order by city")
city state
1 New York NY
2 San Francisco CA
33.14 Subqueries
A “subquery” is a select statement that appears inside of another select statement. The larger select statement is often referred to as the “outer query” and the subquery is often referred to as the “inner query”. This will become much more understandable after seeing some of the examples shown below.
Note that a single outer query may contain several different subqueries. Also note that a subquery may itself contain yet another subquery.
Subqueries are most often found in the SELECT, FROM, WHERE and HAVING clauses of a SQL select statement. The easiest types of subqueries to understand are probably those that appear in the WHERE or HAVING clauses of a larger select statement. Below are some examples. We will cover other types of subqueries after these.
33.14.1 Subqueries in WHERE and HAVING clauses
Subqueries that appear in the WHERE or HAVING clause can be categorized by the “shape” (ie. number of rows and columns that are returned by the subquery).
subqueries that return just one value (i.e. one row of one column)
These types of subqueries can be placed anywhere in the outer query that you normally would put a single value (e.g. before or after an operator such as a “>” sign or “-” sign).
subqueries that return just on column but possibly many rows.
These types of subqueries are used to generate “lists” of value that can appear after the SQL IN operator (see below for an example).
subqueries that return any number of rows and any number of columns.
These types of subqueries may appear after the SQL keyword “EXISTS” which we discuss below.
Subqueries that return a single value
In the following example, the subqueries appear after a “>” and after a “<”. Therefore the subquery MUST return just a single value.
QUESTION (a) Show titles whose price is greater than the average price of all titles.
- Show titles whose price is less than the average price of all titles.
ANSWER (a)
# Titles whose price is greater than the average price of all titles
sqldf("select title_name, price
from titles
where price >
(
select avg(price)
from titles
)
order by title_name
")
title_name price
1 1977! 21.99
2 200 Years of German Humor 19.95
3 Ask Yor System Administrator 39.95
4 How About Never? 19.95
5 I Blame My Mother 23.95
6 What Are The Civilian Applications? 29.99
ANSWER (b)
# Titles whose price is less than the average price of all titles
# (same as above answer except we use "<" instead of ">")
sqldf("select title_name, price
from titles
where price <
(
select avg(price)
from titles
)
order by title_name
")
title_name price
1 But I Did It Unconciously 12.99
2 Exchange of Platitudes 6.95
3 Just Wait Until After School 10.00
4 Kiss My Boo Boo 13.95
5 Perhaps It's a Glandular Problem 7.99
6 Spontaneous, Not Annoying 12.99
Let’s show that the above queries actually did their jobs by determining the avg price of all titles.
# average price of all titles
sqldf(" select avg(price)
from titles
")
avg(price)
1 18.3875
YOU CANNOT WRITE THE ABOVE QUERIES WITHOUT USING A SUBQUERY!!!! THE FOLLOWING WILL PRODUCE AN ERROR SINCE YOU MAY NOT USE AN AGGREGATE FUNCTION SUCH AS AVG INSIDE A WHERE CLAUSE
sqldf("select title_name, price
from titles
where price > avg(price)
order by title_name
")
Error: misuse of aggregate function avg()
33.14.2 Subquery in HAVING clause
These are very similar to subqueries in the WHERE clause:
Question:
Show the types of books with the 3 highest avg prices. Sort the results in alphabetical order of the type name.
ANSWER:
The following will answer the question but it doesn’t put the rows in alphabetical order of the name of the type.
sqldf("select type, avg(price)
from titles
group by type
order by avg(price) desc
limit 3")
type avg(price)
1 computer 39.95000
2 history 23.97667
3 biography 18.96333
To get just the 3 largest average prices, we had to use “order by avg(price) desc” combined with “limit 3” to get only the first 3 rows. However, that leaves the rows in the order of descending avg(price) and not in alphabetical order of the type as we had been asked to do.
We can get the rows in the right order by using a subquery as follows:
sqldf("select type, avg(price)
from titles
group by type
HAVING avg(price) >=
(select avg(price)
from titles
group by type
order by avg(price) desc
limit 1 offset 2)
order by type")
type avg(price)
1 biography 18.96333
2 computer 39.95000
3 history 23.97667
The following is the output of the subquery if it would have been run on its own.
sqldf("
select avg(price)
from titles
group by type
order by avg(price) desc
limit 1 offset 2
")
avg(price)
1 18.96333
The subquery returns just the 3rd most expensive average price. When this is placed in the having clause of the outer query we get the rows we want without the need to use limit in the outer query. We can now order the outer query by the type column instead of by the price.
33.14.3 Subquery in SELECT, FROM, WHERE, HAVING
Now that you’ve seen the basic idea of a subquery, let’s just reiterate that a subquery can appear in any of the following clauses in a select statement:
SELECT - the subquery must return a single item (i.e. single row single column)
FROM - subqueries in the FROM clause may return many rows and many columns.
The subquery is treated as though it were an actual table.
You need to follow the subquery with a 'name' that will be used
as the name of the pseudo-table that is generated by the subquery.
This is similar to the way you can name columns with aliases.
WHERE or the HAVING - these subqueries may return the following:
(a) a single row, single column , i.e. a single value
e.g. a subquery that appears before or after a + sign or a > sign
or anywhere that you'd expect a single value to appear.
(b) possibly many rows of a single column -
after the IN operator
(c) possibly many rows, many columns -
after the EXISTS operator
33.14.4 subquery in SELECT
Subqueries in the SELECT clause must return a single value (i.e. a single row with a single column).
QUESTION: Write a query that shows the title_name, price and the amount the price is above average for those titles whose price is above average.
ANSWER
sqldf("select title_name, price, price - (select avg(price) from titles) as 'amt_above_avg'
from titles
where price > (select avg(price) from titles)
order by title_name
")
title_name price amt_above_avg
1 1977! 21.99 3.6025
2 200 Years of German Humor 19.95 1.5625
3 Ask Yor System Administrator 39.95 21.5625
4 How About Never? 19.95 1.5625
5 I Blame My Mother 23.95 5.5625
6 What Are The Civilian Applications? 29.99 11.6025
Note that the subquery was required since you shouldn’t mix aggregate functions, e.g. avg(price), in the select statement with non-aggregated columns (e.g. title_name). If you do, then the avg function implies that you will get the average of sevaral prices (i.e. the prices from several rows) being displayed in a single row of output. Howver, the non-aggregated columns such as title_name cannot be summarized into a single row, so the implication is that there should be several rows of output.
In ANSI SQL this is an error. However, the SQLite flavor of SQL that we are using allows this but only returns ONE row of data. This is VERY misleading. For example, with a subquery, using sqlite we get the following:
sqldf("select title_name, price, price - avg(price) as 'amt_above_avg'
from titles
where price > (select avg(price) from titles)
order by title_name
")
title_name price amt_above_avg
1 1977! 21.99 -3.973333
Which seems to imply that there is only one row for a book whose price is above average. However, as we saw in the correct answer shown above, there are actually six such book in our titles table.
Bottom line is that you need to use the version above that has the subquery to answer this question correctly.
33.14.5 You may have more than one subquery
QUESTION: Same as above but also show the average price of all books as a column in the results.
ANSWER
sqldf("select title_name,
price,
(select avg(price) from titles) as avgPriceOfAllBooks,
price - (select avg(price) from titles) as 'amt_above_avg'
from titles
where price > (select avg(price) from titles)
order by title_name
")
title_name price avgPriceOfAllBooks amt_above_avg
1 1977! 21.99 18.3875 3.6025
2 200 Years of German Humor 19.95 18.3875 1.5625
3 Ask Yor System Administrator 39.95 18.3875 21.5625
4 How About Never? 19.95 18.3875 1.5625
5 I Blame My Mother 23.95 18.3875 5.5625
6 What Are The Civilian Applications? 29.99 18.3875 11.6025
When the query above is processed, the subqueries get run and produce the result 18.3875. That value is then plugged into the outer query and the outer query is run. The following is what the outer query would look like after the result of the subqueries are plugged into the outer query. You never have to type this. It is done automatically.
sqldf("select title_name,
price,
18.3875 as avgPriceOfAllBooks,
price - 18.3875 as 'amt_above_avg'
from titles
where price > 18.3875
order by title_name
")
title_name price avgPriceOfAllBooks amt_above_avg
1 1977! 21.99 18.3875 3.6025
2 200 Years of German Humor 19.95 18.3875 1.5625
3 Ask Yor System Administrator 39.95 18.3875 21.5625
4 How About Never? 19.95 18.3875 1.5625
5 I Blame My Mother 23.95 18.3875 5.5625
6 What Are The Civilian Applications? 29.99 18.3875 11.6025
Another example - subquery in SELECT
# GOAL:
#
# Show the avg price of each type of book and how much that differs
# from the overall avg price of all books
# THINKING IT THROUGH:
# start with getting the avg price of each type of book
sqldf("select type, avg(price)
from titles
group by type
order by type
")
type avg(price)
1 biography 18.96333
2 children 11.97500
3 computer 39.95000
4 history 23.97667
5 psychology 9.31000
# THINKING IT THROUGH:
# get overall avg price for all books
sqldf("select avg(price) from titles")
avg(price)
1 18.3875
# FINAL ANSWER:
#
# Put is all together into a single answer with a subquery in the SELECT clause
sqldf("select type, avg(price), avg(price) - (select avg(price) from titles) as 'diff from overall avg'
from titles
group by type
order by type
")
type avg(price) diff from overall avg
1 biography 18.96333 0.5758333
2 children 11.97500 -6.4125000
3 computer 39.95000 21.5625000
4 history 23.97667 5.5891667
5 psychology 9.31000 -9.0775000
33.14.6 subquery after IN can return a single column
#------------------------------------------------------------------------
# Subqueries that return a single column of values
# can appear after the IN operator.
#------------------------------------------------------------------------
# Show authors that live in the same city as at least one publisher
# First let's get all the cities where publishers are located
sqldf("select distinct city
from publishers")
city
1 New York
2 San Francisco
3 Hamburg
4 Berkeley
# You can use the "in" operator in sql to compare a value to a "list" of
# values.
#
# The following will work ... but it forces us to type in the names
# of the publisher's cities. If we change the data in the publishers table,
# this "hard-coded" version of the query may not work correctly anymore.
sqldf("select au_fname, au_lname, city
from authors
where city in ('New York','San Francisco','Hamburg','Berkeley')
order by au_lname, au_fname")
au_fname au_lname city
1 Hallie Hull San Francisco
2 Klee Hull San Francisco
3 Christian Kells New York
# We can replace the list of cities with a subquery that automatically
# figures out that list. This subquery returns a single column
# that can be used in place of the hard-coded data from the previous query.
sqldf("select au_fname, au_lname, city
from authors
where city in
(
select distinct city
from publishers
)
order by au_lname, au_fname")
au_fname au_lname city
1 Hallie Hull San Francisco
2 Klee Hull San Francisco
3 Christian Kells New York
33.14.7 Subqueries in the FROM clause
Subqueries in the FROM clause may return many rows and many columns. The subquery is treated as though it were an actual table. You need to follow the subquery with a ‘name’ that will be used as the name of the pseudo-table that is generated by the subquery. This name is known as an ‘alias’ for the table. This is similar to the way you can name columns with aliases.
QUESTION
Show the title, price and type of each book. Also show the max and min and average prices of books of that category as additional columns in the output.
The results should appear as follows.
title_name price type typeMaxPrice typeMinPrice typeAvgPrice
1 1977! 21.99 history 29.99 19.95 23.98
2 200 Years of German Humor 19.95 history 29.99 19.95 23.98
3 Ask Yor System Administrator 39.95 computer 39.95 39.95 39.95
4 But I Did It Unconciously 12.99 psychology 12.99 6.95 9.31
5 Exchange of Platitudes 6.95 psychology 12.99 6.95 9.31
6 How About Never? 19.95 biography 23.95 12.99 18.96
7 I Blame My Mother 23.95 biography 23.95 12.99 18.96
8 Just Wait Until After School 10.00 children 13.95 10.00 11.97
9 Kiss My Boo Boo 13.95 children 13.95 10.00 11.97
10 Not Without My Fabrerge Egg NA biography 23.95 12.99 18.96
11 Perhaps It's a Glandular Problem 7.99 psychology 12.99 6.95 9.31
12 Spontaneous, Not Annoying 12.99 biography 23.95 12.99 18.96
13 What Are The Civilian Applications? 29.99 history 29.99 19.95 23.98
ANSWER
sqldf("select title_name, price, titles.type, typeMaxPrice, typeMinPrice, typeAvgPrice
from titles JOIN (select type,
max(price) as typeMaxPrice,
min(price) as typeMinPrice,
round(avg(price),2) as typeAvgPrice
from titles
group by type) as summaryData
ON titles.type = summaryData.type
order by title_name")
Explanation:
The subquery produces the result shown below.
sqldf("select type,
max(price) as typeMaxPrice,
min(price) as typeMinPrice,
round(avg(price),2) as typeAvgPrice
from titles
group by type")
type typeMaxPrice typeMinPrice typeAvgPrice
1 biography 23.95 12.99 18.96
2 children 13.95 10.00 11.97
3 computer 39.95 39.95 39.95
4 history 29.99 19.95 23.98
5 psychology 12.99 6.95 9.31
This result is called “summaryData” and is treated as a new table. This is not really a table in the database but can be used in the query as though there were such a table in the database.
This new “summaryData table” contains just one row for each type. We can therefore consider the “type” to be the primary key of this new “summaryData table. We then to JOIN the titles table with this new”summaryData table” by thinking of the summaryData.type column as the “primary key” of the summaryData table and the title.type column as a foreign key in the titles table that references the type column in the summaryData table.
The result is that we can mix non-aggregated data (e.g. title_name column) with aggregate data (e.g. max, min and avg of prices for each type) in the output of the query.
33.16 Table aliases with subqueries
##################################################################
# USING TABLE ALIASES FOR CORRELATED SUBQUERIES THAT USE THE SAME TABLES
# IN THE INNER AND OUTER QUERIES
#
# In correlated subqueries if the inner query refers to the same
# table(s) as the outer query - you must use a table alias.
##################################################################
# Show the books whose length in pages is longer than the average
# length of books OF THE SAME TYPE
#
# You need to use a correlated subquery here.
#
# You need to use a subquery to compare the length of a particular
# book (which is a non-aggregated value) to the avg(length)
# (which is an aggregated value).
#
# To be more specific, you must use a "correlated" subquery
# since each title will have a potentially differnt type. Therefore
# the average length will be potentially different for books
# of different types and sql will need to run the subquery once
# for each row in the titles table.
#
# To be even more specific ... the inner query and the
# outer query refer to the SAME table, i.e. the titles table.
# AND the inner query must refer to information that is taken from
# the outer query for that table. Therefore you must somehow make
# it clear in the inner query which information is from the outer-query-title-table
# and which information is from the version of the title table that is
# in the inner query's from clause.
# This is done by using "alias" names for the outer query's copy of the titles table
# and a different "alias" for the inner query's copy of the titles table as shown below.
sqldf("
select title_name, type, pages
from titles as outerTitles
where pages >
(
select avg(pages)
from titles as innerTitles
where innerTitles.type = outerTitles.type
)
order by type, pages desc
")
title_name type pages
1 Spontaneous, Not Annoying biography 507
2 How About Never? biography 473
3 Just Wait Until After School children 86
4 What Are The Civilian Applications? history 802
5 Perhaps It's a Glandular Problem psychology 826
# We can prove to ourselves that this worked by showing the average
# number of pages for each type of title
sqldf("select type, avg(pages)
from titles
group by type
order by type")
type avg(pages)
1 biography 437.6667
2 children 54.0000
3 computer 1226.0000
4 history 307.6667
5 psychology 512.3333
# We can see those books that are below average in length for their
# type of book by simply changing the > to a < in
# the query
sqldf("
select title_name, type, pages
from titles as outerTitles
where pages <
(
select avg(pages)
from titles as innerTitles
where innerTitles.type = outerTitles.type
)
order by type, pages desc
")
title_name type pages
1 I Blame My Mother biography 333
2 Kiss My Boo Boo children 22
3 1977! history 107
4 200 Years of German Humor history 14
5 But I Did It Unconciously psychology 510
6 Exchange of Platitudes psychology 201
33.17 EXISTS (some subquery)
#-------------------------------------------------------------------------------
# Using "EXISTS" with correlated subqueries that may or may not return any rows.
#
# The EXISTS keyword can be used in WHERE and in HAVING clauses.
#
# e.g. WHERE EXISTS (SOME_SUBQUERY)
#
# HAVING EXISTS (SOME_SUBQUERY)
#
# The EXISTS keyword is always followed by a subquery.
#
# If the subquery returns any rows then EXISTS (subquery) evaluates to TRUE.
# If the subquery returns zero rows then EXISTS (subquery) evaluates to FALSE.
#-------------------------------------------------------------------------------
# "EXISTS ( some subquery )" is used inside a where or having clause.
# The row from the outer query is included in the results if the subquery
# contains ANY rows. The actual data resturned by the subqeury is not significant.
# The only significance is if the subquery returns SOMETHING or returns NOTHING.
# EXAMPLE:
# Show authors that live in the same city as at least one publisher.
#
# [NOTE: above we saw a different way to do this by using "IN (subquery)" ]
sqldf("select au_fname, au_lname, authors.city
from authors
where EXISTS
(select *
from publishers
where publishers.city = authors.city)
order by au_lname, au_fname
")
au_fname au_lname city
1 Hallie Hull San Francisco
2 Klee Hull San Francisco
3 Christian Kells New York
# You know that the subquery is a CORRELATED subquery because it
# cannot be run by itself (as shown below). This is because
# the query refers to the the authors table but does not include
# the authors table in the from clause.
sqldf( "select *
from publishers
where publishers.city = authors.city")
Error: no such column: authors.city
33.18 MULTIPLE WAYS OF ACCOMPLISHING THE SAME THING
#############################################################.
# MULTIPLE WAYS OF ACCOMPLISHING THE SAME THING
#
# There are often multiple different ways you can write queries
# that result in the same output. Different people will approach
# the same problem differently.
#
# When faced with different approaches, you should try if possible to
# avoid correlated subqueries. This is because
# as a "rule of thumb" correlated subqueries will run slower
# than other approaches since the subquery is re-run many times,
# i.e. once for each row in the outer query.
#
# The following is an example of a query that can be accomplished
# in many several different ways.
#############################################################.
#-------------------------------------------------------.
# QUESTION:
#
# Write a query to show the names of the authors
# who did NOT write any psychology books
#-------------------------------------------------------.
#...........................................................
# One answer - using a correlated subquery with NOT EXISTS
#...........................................................
# Authors who did not write any psychology books:
sqldf("select au_fname, au_lname
from authors
where NOT EXISTS
(select *
from title_authors join titles
on title_authors.title_id = titles.title_id
where title_authors.au_id = authors.au_id and
type = 'psychology')
order by au_lname, au_fname
")
au_fname au_lname
1 Sarah Buchman
2 Wendy Heydemark
3 Christian Kells
4 Paddy O'Furniture
# FYI - in case you're curious - this is a query for the authors who DID
# write psychology books
# authors who did write psychology books
sqldf("select distinct au_fname, au_lname
from authors join title_authors on authors.au_id = title_authors.au_id
join titles on title_authors.title_id = titles.title_id
where type = 'psychology'")
au_fname au_lname
1 Hallie Hull
2 Klee Hull
3 Harvey Kellsey
#...........................................................
# Another answer - using EXCEPT
#
# The first query before the except retrieves all the authors names.
# The 2nd query after the EXCEPT retrieves authors who wrote psychology books.
# The EXCEPT removes the rows returned by the 2nd query from the results of
# the first query.
#...........................................................
# Authors who did not write any psychology books:
sqldf("-- get all the authors' names
select au_fname, au_lname
from authors
EXCEPT -- except removes any row that appears in the 2nd query
-- get the authors who wrote psychology books
select distinct au_fname, au_lname
from authors join title_authors on authors.au_id = title_authors.au_id
join titles on titles.title_id = title_authors.title_id
where type = 'psychology'
order by au_lname, au_fname")
au_fname au_lname
1 Sarah Buchman
2 Wendy Heydemark
3 Christian Kells
4 Paddy O'Furniture
#...........................................................
#
# Another answer - using a LEFT JOIN combined with a subquery in a FROM clause.
#
# You can use a subquery in a FROM clause to create a temporary table
# that will be used for the query. The temporary table needs to be
# given an alias.
#
# The example below uses a left join and a subquery in the FROM clause
# to retrieve the authors who did not write psychology books.
# This is yet one more way to accomplish the same results as the
# examples above.
#---------------------------------------------------------------------.
# Yet another way of doing the same thing - by using a LEFT join.
# This also shows an example of using a subquery in a from clause
# to create a temporary table.
sqldf("
select au_fname, au_lname
from authors LEFT JOIN
-- the following subquery creates a temporary table
-- that contains the authors/titles of psychology books
(select *
from title_authors join titles
on title_authors.title_id = titles.title_id
where type = 'psychology') AS psychologyTitles
on authors.au_id = psychologyTitles.au_id
where psychologyTitles.au_id is NULL
order by au_lname, au_fname")
au_fname au_lname
1 Sarah Buchman
2 Wendy Heydemark
3 Christian Kells
4 Paddy O'Furniture
# You can see the contents of the temporary table by running the
# subquery by itself as shown here:
sqldf("select *
from title_authors join titles
on title_authors.title_id = titles.title_id
where type = 'psychology'")
title_id au_id au_order royalty_shares title_id title_name type pub_id pages price sales pubdate
1 T04 A03 1 0.6 T04 But I Did It Unconciously psychology P01 510 12.99 13001 5/31/1999 0:00
2 T04 A04 2 0.4 T04 But I Did It Unconciously psychology P01 510 12.99 13001 5/31/1999 0:00
3 T05 A04 1 1.0 T05 Exchange of Platitudes psychology P01 201 6.95 201440 1/1/2001 0:00
4 T11 A03 2 0.3 T11 Perhaps It's a Glandular Problem psychology P05 826 7.99 94123 11/30/2000 0:00
5 T11 A04 3 0.3 T11 Perhaps It's a Glandular Problem psychology P05 826 7.99 94123 11/30/2000 0:00
6 T11 A06 1 0.4 T11 Perhaps It's a Glandular Problem psychology P05 826 7.99 94123 11/30/2000 0:00
# You can see all the columns from the left join here
sqldf("
select *
from authors LEFT JOIN
-- the following subquery creates a temporary table
-- that contains the authors/titles of psychology books
(select *
from title_authors join titles
on title_authors.title_id = titles.title_id
where type = 'psychology') AS psychologyTitles
on authors.au_id = psychologyTitles.au_id
-- where psychologyTitles.au_id is NULL
order by au_lname, au_fname")
au_id au_fname au_lname phone address city state zip title_id au_id au_order royalty_shares title_id:1 title_name type pub_id pages price sales pubdate
1 A01 Sarah Buchman 718-496-7223 75 West 205 St Bronx NY 10468 <NA> <NA> NA NA <NA> <NA> <NA> <NA> NA NA NA <NA>
2 A02 Wendy Heydemark 303-986-7020 2922 Baseline Rd Boulder CO 80303 <NA> <NA> NA NA <NA> <NA> <NA> <NA> NA NA NA <NA>
3 A03 Hallie Hull 415-549-4278 3800 Waldo Ave, #14F San Francisco CA 94123 T04 A03 1 0.6 T04 But I Did It Unconciously psychology P01 510 12.99 13001 5/31/1999 0:00
4 A03 Hallie Hull 415-549-4278 3800 Waldo Ave, #14F San Francisco CA 94123 T11 A03 2 0.3 T11 Perhaps It's a Glandular Problem psychology P05 826 7.99 94123 11/30/2000 0:00
5 A04 Klee Hull 415-549-4278 3800 Waldo Ave, #14F San Francisco CA 94123 T04 A04 2 0.4 T04 But I Did It Unconciously psychology P01 510 12.99 13001 5/31/1999 0:00
6 A04 Klee Hull 415-549-4278 3800 Waldo Ave, #14F San Francisco CA 94123 T05 A04 1 1.0 T05 Exchange of Platitudes psychology P01 201 6.95 201440 1/1/2001 0:00
7 A04 Klee Hull 415-549-4278 3800 Waldo Ave, #14F San Francisco CA 94123 T11 A04 3 0.3 T11 Perhaps It's a Glandular Problem psychology P05 826 7.99 94123 11/30/2000 0:00
8 A05 Christian Kells 212-771-4680 114 Horatio St New York NY 10014 <NA> <NA> NA NA <NA> <NA> <NA> <NA> NA NA NA <NA>
9 A06 Harvey Kellsey 650-836-7128 390 Serra Mall Palo Alto CA 94305 T11 A06 1 0.4 T11 Perhaps It's a Glandular Problem psychology P05 826 7.99 94123 11/30/2000 0:00
10 A07 Paddy O'Furniture 941-925-0752 1442 Main St Sarasota FL 34236 <NA> <NA> NA NA <NA> <NA> <NA> <NA> NA NA NA <NA>
33.19 case - like an if for SQL
######################################################################.
# The case operator can appear in a SELECT clause to create a custom column.
# It works similar to IF/ELSE/ELSE IF constructs that appear in other
# languages.
#
# The example below should be easy to understand.
#
# If there is no ELSE part and no conditions are true, it returns NULL.
#
# SELECT ... SOME COLUMNS ... ,
#
# case when LOGICAL EXPRESSION then RESULT1
# when LOGICAL EXPRESSION then RESULT2
# when LOGICAL EXPRESSION then RESULT3
# ... etc
# else RESULT_IF_NO_LOGICAL_EXPRESSION_WAS_TRUE
# end as COLUMN_NAME,
#
# ... SOME OTHER COLUMNS ...
#
# FROM ... etc
######################################################################.
# EXAMPLE - this shows the lengths and prices of books in categories
# rather than actual numbers.
sqldf("select title_name,
case
when pages is null then 'unknown'
when pages < 100 then 'short'
when pages < 500 then 'medium'
when pages < 1000 then 'long'
else 'crazy long'
end as length,
pages,
case
when price < 25 then 'cheap'
when price is null then 'unknown'
when price < 30 then 'midrange'
else 'expensive'
end as afforability,
price
from titles
order by pages, title_name")
title_name length pages afforability price
1 Not Without My Fabrerge Egg unknown NA unknown NA
2 200 Years of German Humor short 14 cheap 19.95
3 Kiss My Boo Boo short 22 cheap 13.95
4 Just Wait Until After School short 86 cheap 10.00
5 1977! medium 107 cheap 21.99
6 Exchange of Platitudes medium 201 cheap 6.95
7 I Blame My Mother medium 333 cheap 23.95
8 How About Never? medium 473 cheap 19.95
9 Spontaneous, Not Annoying long 507 cheap 12.99
10 But I Did It Unconciously long 510 cheap 12.99
11 What Are The Civilian Applications? long 802 midrange 29.99
12 Perhaps It's a Glandular Problem long 826 cheap 7.99
13 Ask Yor System Administrator crazy long 1226 expensive 39.95
# EXAMPLE:
#
# NOTE: Even if you don't understand this description the results below
# should be pretty easy to understand.
#
# DESCRIPTION:
# Show a list of all the authors.
#
# Authors who wrote at least one psychology
# book should appear on a row with the word psycology in the column
# named psycology_or_other.
#
# Authors who wrote non-psychology books should appear in a row with the
# word "other" in the psychology_or_other column.
#
# Authors who wrote both psychology and non-psychology books should
# appear in two different rows. One for pyschology and one for other.
sqldf("select distinct
case when type = 'psychology' then 'psychology'
else 'other'
end as pyschology_or_other,
au_lname, au_fname
from authors join title_authors on authors.au_id = title_authors.au_id
join titles on titles.title_id = title_authors.title_id
order by pyschology_or_other, au_lname, au_fname")
pyschology_or_other au_lname au_fname
1 other Buchman Sarah
2 other Heydemark Wendy
3 other Hull Klee
4 other Kells Christian
5 other Kellsey Harvey
6 psychology Hull Hallie
7 psychology Hull Klee
8 psychology Kellsey Harvey
33.20 self joins
######################################################
# Self joins
######################################################
# A "self join" is used when you want to compare two different rows
# FROM THE SAME TABLE. To do so, the from clause uses two different
# copies of the same table in a join.
# The best way to understand this is with an example:
#--------------------------------------------------------------------------
# find the names of authors who share the same address as another author
#--------------------------------------------------------------------------
# We want to compare two different rows from the authors table to see if
# they are from the same address.
# Look at the where clause ... why do you think it uses a ">" sign?
sqldf("
select authors1.au_fname, authors1.au_lname, authors2.au_fname, authors2.au_lname, authors1.address
from authors authors1 join authors authors2 on authors1.address = authors2.address
where authors1.au_id > authors2.au_id ")
au_fname au_lname au_fname au_lname address
1 Klee Hull Hallie Hull 3800 Waldo Ave, #14F
# let's analyze what's really going on by showing the CROSS JOIN that happens
# internally as the above select statement is being processed
sqldf("select authors1.au_id, authors1.au_fname, authors1.au_lname, authors1.address,
authors2.au_id, authors2.au_fname, authors2.au_lname, authors2.address
from authors authors1 CROSS JOIN authors authors2
order by authors1.au_id, authors2.au_id")
au_id au_fname au_lname address au_id au_fname au_lname address
1 A01 Sarah Buchman 75 West 205 St A01 Sarah Buchman 75 West 205 St
2 A01 Sarah Buchman 75 West 205 St A02 Wendy Heydemark 2922 Baseline Rd
3 A01 Sarah Buchman 75 West 205 St A03 Hallie Hull 3800 Waldo Ave, #14F
4 A01 Sarah Buchman 75 West 205 St A04 Klee Hull 3800 Waldo Ave, #14F
5 A01 Sarah Buchman 75 West 205 St A05 Christian Kells 114 Horatio St
6 A01 Sarah Buchman 75 West 205 St A06 Harvey Kellsey 390 Serra Mall
7 A01 Sarah Buchman 75 West 205 St A07 Paddy O'Furniture 1442 Main St
8 A02 Wendy Heydemark 2922 Baseline Rd A01 Sarah Buchman 75 West 205 St
9 A02 Wendy Heydemark 2922 Baseline Rd A02 Wendy Heydemark 2922 Baseline Rd
10 A02 Wendy Heydemark 2922 Baseline Rd A03 Hallie Hull 3800 Waldo Ave, #14F
11 A02 Wendy Heydemark 2922 Baseline Rd A04 Klee Hull 3800 Waldo Ave, #14F
12 A02 Wendy Heydemark 2922 Baseline Rd A05 Christian Kells 114 Horatio St
13 A02 Wendy Heydemark 2922 Baseline Rd A06 Harvey Kellsey 390 Serra Mall
14 A02 Wendy Heydemark 2922 Baseline Rd A07 Paddy O'Furniture 1442 Main St
15 A03 Hallie Hull 3800 Waldo Ave, #14F A01 Sarah Buchman 75 West 205 St
16 A03 Hallie Hull 3800 Waldo Ave, #14F A02 Wendy Heydemark 2922 Baseline Rd
17 A03 Hallie Hull 3800 Waldo Ave, #14F A03 Hallie Hull 3800 Waldo Ave, #14F
18 A03 Hallie Hull 3800 Waldo Ave, #14F A04 Klee Hull 3800 Waldo Ave, #14F
19 A03 Hallie Hull 3800 Waldo Ave, #14F A05 Christian Kells 114 Horatio St
20 A03 Hallie Hull 3800 Waldo Ave, #14F A06 Harvey Kellsey 390 Serra Mall
21 A03 Hallie Hull 3800 Waldo Ave, #14F A07 Paddy O'Furniture 1442 Main St
22 A04 Klee Hull 3800 Waldo Ave, #14F A01 Sarah Buchman 75 West 205 St
23 A04 Klee Hull 3800 Waldo Ave, #14F A02 Wendy Heydemark 2922 Baseline Rd
24 A04 Klee Hull 3800 Waldo Ave, #14F A03 Hallie Hull 3800 Waldo Ave, #14F
25 A04 Klee Hull 3800 Waldo Ave, #14F A04 Klee Hull 3800 Waldo Ave, #14F
26 A04 Klee Hull 3800 Waldo Ave, #14F A05 Christian Kells 114 Horatio St
27 A04 Klee Hull 3800 Waldo Ave, #14F A06 Harvey Kellsey 390 Serra Mall
28 A04 Klee Hull 3800 Waldo Ave, #14F A07 Paddy O'Furniture 1442 Main St
29 A05 Christian Kells 114 Horatio St A01 Sarah Buchman 75 West 205 St
30 A05 Christian Kells 114 Horatio St A02 Wendy Heydemark 2922 Baseline Rd
31 A05 Christian Kells 114 Horatio St A03 Hallie Hull 3800 Waldo Ave, #14F
32 A05 Christian Kells 114 Horatio St A04 Klee Hull 3800 Waldo Ave, #14F
33 A05 Christian Kells 114 Horatio St A05 Christian Kells 114 Horatio St
34 A05 Christian Kells 114 Horatio St A06 Harvey Kellsey 390 Serra Mall
35 A05 Christian Kells 114 Horatio St A07 Paddy O'Furniture 1442 Main St
36 A06 Harvey Kellsey 390 Serra Mall A01 Sarah Buchman 75 West 205 St
37 A06 Harvey Kellsey 390 Serra Mall A02 Wendy Heydemark 2922 Baseline Rd
38 A06 Harvey Kellsey 390 Serra Mall A03 Hallie Hull 3800 Waldo Ave, #14F
39 A06 Harvey Kellsey 390 Serra Mall A04 Klee Hull 3800 Waldo Ave, #14F
40 A06 Harvey Kellsey 390 Serra Mall A05 Christian Kells 114 Horatio St
41 A06 Harvey Kellsey 390 Serra Mall A06 Harvey Kellsey 390 Serra Mall
42 A06 Harvey Kellsey 390 Serra Mall A07 Paddy O'Furniture 1442 Main St
43 A07 Paddy O'Furniture 1442 Main St A01 Sarah Buchman 75 West 205 St
44 A07 Paddy O'Furniture 1442 Main St A02 Wendy Heydemark 2922 Baseline Rd
45 A07 Paddy O'Furniture 1442 Main St A03 Hallie Hull 3800 Waldo Ave, #14F
46 A07 Paddy O'Furniture 1442 Main St A04 Klee Hull 3800 Waldo Ave, #14F
47 A07 Paddy O'Furniture 1442 Main St A05 Christian Kells 114 Horatio St
48 A07 Paddy O'Furniture 1442 Main St A06 Harvey Kellsey 390 Serra Mall
49 A07 Paddy O'Furniture 1442 Main St A07 Paddy O'Furniture 1442 Main St
# If you take out the where clause, you will get extra rows
sqldf("
select authors1.au_fname, authors1.au_lname, authors2.au_fname, authors2.au_lname, authors1.address
from authors authors1 join authors authors2 on authors1.address = authors2.address
")
au_fname au_lname au_fname au_lname address
1 Sarah Buchman Sarah Buchman 75 West 205 St
2 Wendy Heydemark Wendy Heydemark 2922 Baseline Rd
3 Hallie Hull Hallie Hull 3800 Waldo Ave, #14F
4 Hallie Hull Klee Hull 3800 Waldo Ave, #14F
5 Klee Hull Hallie Hull 3800 Waldo Ave, #14F
6 Klee Hull Klee Hull 3800 Waldo Ave, #14F
7 Christian Kells Christian Kells 114 Horatio St
8 Harvey Kellsey Harvey Kellsey 390 Serra Mall
9 Paddy O'Furniture Paddy O'Furniture 1442 Main St
# To understand why you get extra rows without the where take a look at the output
# of the cross join of the authors table with itself. It is this cross join
# that is then filtered by the on clause and by the where clause.
sqldf("select authors1.au_fname, authors1.au_lname , authors1.address, authors2.au_fname, authors2.au_lname, authors2.address
from authors as authors1 cross join authors as authors2
order by authors1.au_lname, authors1.au_fname, authors2.au_lname, authors2.au_fname")
au_fname au_lname address au_fname au_lname address
1 Sarah Buchman 75 West 205 St Sarah Buchman 75 West 205 St
2 Sarah Buchman 75 West 205 St Wendy Heydemark 2922 Baseline Rd
3 Sarah Buchman 75 West 205 St Hallie Hull 3800 Waldo Ave, #14F
4 Sarah Buchman 75 West 205 St Klee Hull 3800 Waldo Ave, #14F
5 Sarah Buchman 75 West 205 St Christian Kells 114 Horatio St
6 Sarah Buchman 75 West 205 St Harvey Kellsey 390 Serra Mall
7 Sarah Buchman 75 West 205 St Paddy O'Furniture 1442 Main St
8 Wendy Heydemark 2922 Baseline Rd Sarah Buchman 75 West 205 St
9 Wendy Heydemark 2922 Baseline Rd Wendy Heydemark 2922 Baseline Rd
10 Wendy Heydemark 2922 Baseline Rd Hallie Hull 3800 Waldo Ave, #14F
11 Wendy Heydemark 2922 Baseline Rd Klee Hull 3800 Waldo Ave, #14F
12 Wendy Heydemark 2922 Baseline Rd Christian Kells 114 Horatio St
13 Wendy Heydemark 2922 Baseline Rd Harvey Kellsey 390 Serra Mall
14 Wendy Heydemark 2922 Baseline Rd Paddy O'Furniture 1442 Main St
15 Hallie Hull 3800 Waldo Ave, #14F Sarah Buchman 75 West 205 St
16 Hallie Hull 3800 Waldo Ave, #14F Wendy Heydemark 2922 Baseline Rd
17 Hallie Hull 3800 Waldo Ave, #14F Hallie Hull 3800 Waldo Ave, #14F
18 Hallie Hull 3800 Waldo Ave, #14F Klee Hull 3800 Waldo Ave, #14F
19 Hallie Hull 3800 Waldo Ave, #14F Christian Kells 114 Horatio St
20 Hallie Hull 3800 Waldo Ave, #14F Harvey Kellsey 390 Serra Mall
21 Hallie Hull 3800 Waldo Ave, #14F Paddy O'Furniture 1442 Main St
22 Klee Hull 3800 Waldo Ave, #14F Sarah Buchman 75 West 205 St
23 Klee Hull 3800 Waldo Ave, #14F Wendy Heydemark 2922 Baseline Rd
24 Klee Hull 3800 Waldo Ave, #14F Hallie Hull 3800 Waldo Ave, #14F
25 Klee Hull 3800 Waldo Ave, #14F Klee Hull 3800 Waldo Ave, #14F
26 Klee Hull 3800 Waldo Ave, #14F Christian Kells 114 Horatio St
27 Klee Hull 3800 Waldo Ave, #14F Harvey Kellsey 390 Serra Mall
28 Klee Hull 3800 Waldo Ave, #14F Paddy O'Furniture 1442 Main St
29 Christian Kells 114 Horatio St Sarah Buchman 75 West 205 St
30 Christian Kells 114 Horatio St Wendy Heydemark 2922 Baseline Rd
31 Christian Kells 114 Horatio St Hallie Hull 3800 Waldo Ave, #14F
32 Christian Kells 114 Horatio St Klee Hull 3800 Waldo Ave, #14F
33 Christian Kells 114 Horatio St Christian Kells 114 Horatio St
34 Christian Kells 114 Horatio St Harvey Kellsey 390 Serra Mall
35 Christian Kells 114 Horatio St Paddy O'Furniture 1442 Main St
36 Harvey Kellsey 390 Serra Mall Sarah Buchman 75 West 205 St
37 Harvey Kellsey 390 Serra Mall Wendy Heydemark 2922 Baseline Rd
38 Harvey Kellsey 390 Serra Mall Hallie Hull 3800 Waldo Ave, #14F
39 Harvey Kellsey 390 Serra Mall Klee Hull 3800 Waldo Ave, #14F
40 Harvey Kellsey 390 Serra Mall Christian Kells 114 Horatio St
41 Harvey Kellsey 390 Serra Mall Harvey Kellsey 390 Serra Mall
42 Harvey Kellsey 390 Serra Mall Paddy O'Furniture 1442 Main St
43 Paddy O'Furniture 1442 Main St Sarah Buchman 75 West 205 St
44 Paddy O'Furniture 1442 Main St Wendy Heydemark 2922 Baseline Rd
45 Paddy O'Furniture 1442 Main St Hallie Hull 3800 Waldo Ave, #14F
46 Paddy O'Furniture 1442 Main St Klee Hull 3800 Waldo Ave, #14F
47 Paddy O'Furniture 1442 Main St Christian Kells 114 Horatio St
48 Paddy O'Furniture 1442 Main St Harvey Kellsey 390 Serra Mall
49 Paddy O'Furniture 1442 Main St Paddy O'Furniture 1442 Main St
# You might think that we'd solve the problem if we used != instead of >
# However, this is still a problem. Can you figure out why?
sqldf("
select authors1.au_fname, authors1.au_lname, authors2.au_fname, authors2.au_lname, authors1.address
from authors authors1 join authors authors2 on authors1.address = authors2.address
where authors1.au_id != authors2.au_id ")
au_fname au_lname au_fname au_lname address
1 Hallie Hull Klee Hull 3800 Waldo Ave, #14F
2 Klee Hull Hallie Hull 3800 Waldo Ave, #14F
# if we want to check that multiple columns are equal we can do that too
sqldf("
select authors1.au_fname, authors1.au_lname, authors2.au_fname, authors2.au_lname, authors1.address
from authors authors1 join authors authors2
on
authors1.address = authors2.address and
authors1.city = authors2.city and
authors1.state = authors2.state and
authors1.zip = authors2.zip
where authors1.au_id > authors2.au_id ")
au_fname au_lname au_fname au_lname address
1 Klee Hull Hallie Hull 3800 Waldo Ave, #14F
# If we used a subquery for this type of question
# we could not see both authors being displayed on the same row
33.21 self joins - another example
#------------------------------------------------------------------------
# ANOTHER SELF JOIN EXAMPLE
#------------------------------------------------------------------------
# Organization management hierarchy
# - see the file "orgChart_large.png"
#------------------------------------------------------------------------
= data.frame(
employees empid = c( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
name = c("ann", "floyd", "randall", "albert","shawn","tyrone", "calvin", "harold", "tanya", "jorge", "claire" ),
lastname = c("sanders","flake","rodgers", "aames","stevens","tex", "coolidge", "hughs", "tamashevsky","jones", "coolidge" ),
salary = c(200, 50, 75, 90, 150, 250, 75, 190, 150, 175, 110 ),
mgr = c(NA, 1, 1, 1, 2, 3, 3, 2, 7, 2, 7)
)
employees
empid name lastname salary mgr
1 1 ann sanders 200 NA
2 2 floyd flake 50 1
3 3 randall rodgers 75 1
4 4 albert aames 90 1
5 5 shawn stevens 150 2
6 6 tyrone tex 250 3
7 7 calvin coolidge 75 3
8 8 harold hughs 190 2
9 9 tanya tamashevsky 150 7
10 10 jorge jones 175 2
11 11 claire coolidge 110 7
# write a query that shows SO_AND_SO is managed by SO_AND_SO for each person
sqldf("select emp.name || ' is managed by ' || manager.name
from employees AS emp join employees AS manager on emp.mgr = manager.empid
order by emp.name
")
emp.name || ' is managed by ' || manager.name
1 albert is managed by ann
2 calvin is managed by randall
3 claire is managed by calvin
4 floyd is managed by ann
5 harold is managed by floyd
6 jorge is managed by floyd
7 randall is managed by ann
8 shawn is managed by floyd
9 tanya is managed by calvin
10 tyrone is managed by randall
# To make it a little easier to read we could use the
# aliasses emps and mgrs instead of emp1 and emp2
sqldf("select emps.name || ' is managed by ' || mgrs.name
from employees emps join employees mgrs on emps.mgr = mgrs.empid
order by emps.name
")
emps.name || ' is managed by ' || mgrs.name
1 albert is managed by ann
2 calvin is managed by randall
3 claire is managed by calvin
4 floyd is managed by ann
5 harold is managed by floyd
6 jorge is managed by floyd
7 randall is managed by ann
8 shawn is managed by floyd
9 tanya is managed by calvin
10 tyrone is managed by randall
sqldf("select emps.name || ' is managed by ' || mgrs.name
from employees as emps join employees as mgrs on emps.mgr = mgrs.empid
order by emps.name
")
emps.name || ' is managed by ' || mgrs.name
1 albert is managed by ann
2 calvin is managed by randall
3 claire is managed by calvin
4 floyd is managed by ann
5 harold is managed by floyd
6 jorge is managed by floyd
7 randall is managed by ann
8 shawn is managed by floyd
9 tanya is managed by calvin
10 tyrone is managed by randall
sqldf("select publishers.pub_name, titles.title_name
from publishers join titles on publishers.pub_id = titles.pub_id
where publishers.pub_name like 'A%'")
pub_name title_name
1 Abatis Publishers 1977!
2 Abatis Publishers But I Did It Unconciously
3 Abatis Publishers Exchange of Platitudes
4 Abatis Publishers How About Never?
5 Abatis Publishers Just Wait Until After School
6 Abatis Publishers Kiss My Boo Boo
7 AAA Publishing Not Without My Fabrerge Egg
8 AAA Publishing Perhaps It's a Glandular Problem
9 AAA Publishing Spontaneous, Not Annoying
sqldf("select publishers.pub_name, titles.title_name
from publishers join titles on publishers.pub_id = titles.pub_id
and publishers.pub_name like 'A%'")
pub_name title_name
1 Abatis Publishers 1977!
2 Abatis Publishers But I Did It Unconciously
3 Abatis Publishers Exchange of Platitudes
4 Abatis Publishers How About Never?
5 Abatis Publishers Just Wait Until After School
6 Abatis Publishers Kiss My Boo Boo
7 AAA Publishing Not Without My Fabrerge Egg
8 AAA Publishing Perhaps It's a Glandular Problem
9 AAA Publishing Spontaneous, Not Annoying
sqldf("select publishers.pub_name, titles.title_name
from publishers, titles
where publishers.pub_id = titles.pub_id
and publishers.pub_name like 'A%'")
pub_name title_name
1 Abatis Publishers 1977!
2 Abatis Publishers But I Did It Unconciously
3 Abatis Publishers Exchange of Platitudes
4 Abatis Publishers How About Never?
5 Abatis Publishers Just Wait Until After School
6 Abatis Publishers Kiss My Boo Boo
7 AAA Publishing Not Without My Fabrerge Egg
8 AAA Publishing Perhaps It's a Glandular Problem
9 AAA Publishing Spontaneous, Not Annoying
# It might help to understand what's going on by looking
# at the cross join of the two tables.
sqldf("select emps.*, mgrs.*
from employees as emps CROSS JOIN employees as mgrs
order by emps.empid, mgrs.empid")
empid name lastname salary mgr empid name lastname salary mgr
1 1 ann sanders 200 NA 1 ann sanders 200 NA
2 1 ann sanders 200 NA 2 floyd flake 50 1
3 1 ann sanders 200 NA 3 randall rodgers 75 1
4 1 ann sanders 200 NA 4 albert aames 90 1
5 1 ann sanders 200 NA 5 shawn stevens 150 2
6 1 ann sanders 200 NA 6 tyrone tex 250 3
7 1 ann sanders 200 NA 7 calvin coolidge 75 3
8 1 ann sanders 200 NA 8 harold hughs 190 2
9 1 ann sanders 200 NA 9 tanya tamashevsky 150 7
10 1 ann sanders 200 NA 10 jorge jones 175 2
11 1 ann sanders 200 NA 11 claire coolidge 110 7
12 2 floyd flake 50 1 1 ann sanders 200 NA
13 2 floyd flake 50 1 2 floyd flake 50 1
14 2 floyd flake 50 1 3 randall rodgers 75 1
15 2 floyd flake 50 1 4 albert aames 90 1
16 2 floyd flake 50 1 5 shawn stevens 150 2
17 2 floyd flake 50 1 6 tyrone tex 250 3
18 2 floyd flake 50 1 7 calvin coolidge 75 3
19 2 floyd flake 50 1 8 harold hughs 190 2
20 2 floyd flake 50 1 9 tanya tamashevsky 150 7
21 2 floyd flake 50 1 10 jorge jones 175 2
22 2 floyd flake 50 1 11 claire coolidge 110 7
23 3 randall rodgers 75 1 1 ann sanders 200 NA
24 3 randall rodgers 75 1 2 floyd flake 50 1
25 3 randall rodgers 75 1 3 randall rodgers 75 1
26 3 randall rodgers 75 1 4 albert aames 90 1
27 3 randall rodgers 75 1 5 shawn stevens 150 2
28 3 randall rodgers 75 1 6 tyrone tex 250 3
29 3 randall rodgers 75 1 7 calvin coolidge 75 3
30 3 randall rodgers 75 1 8 harold hughs 190 2
31 3 randall rodgers 75 1 9 tanya tamashevsky 150 7
32 3 randall rodgers 75 1 10 jorge jones 175 2
33 3 randall rodgers 75 1 11 claire coolidge 110 7
34 4 albert aames 90 1 1 ann sanders 200 NA
35 4 albert aames 90 1 2 floyd flake 50 1
36 4 albert aames 90 1 3 randall rodgers 75 1
37 4 albert aames 90 1 4 albert aames 90 1
38 4 albert aames 90 1 5 shawn stevens 150 2
39 4 albert aames 90 1 6 tyrone tex 250 3
40 4 albert aames 90 1 7 calvin coolidge 75 3
41 4 albert aames 90 1 8 harold hughs 190 2
42 4 albert aames 90 1 9 tanya tamashevsky 150 7
43 4 albert aames 90 1 10 jorge jones 175 2
44 4 albert aames 90 1 11 claire coolidge 110 7
45 5 shawn stevens 150 2 1 ann sanders 200 NA
46 5 shawn stevens 150 2 2 floyd flake 50 1
47 5 shawn stevens 150 2 3 randall rodgers 75 1
48 5 shawn stevens 150 2 4 albert aames 90 1
49 5 shawn stevens 150 2 5 shawn stevens 150 2
50 5 shawn stevens 150 2 6 tyrone tex 250 3
51 5 shawn stevens 150 2 7 calvin coolidge 75 3
52 5 shawn stevens 150 2 8 harold hughs 190 2
53 5 shawn stevens 150 2 9 tanya tamashevsky 150 7
54 5 shawn stevens 150 2 10 jorge jones 175 2
55 5 shawn stevens 150 2 11 claire coolidge 110 7
56 6 tyrone tex 250 3 1 ann sanders 200 NA
57 6 tyrone tex 250 3 2 floyd flake 50 1
58 6 tyrone tex 250 3 3 randall rodgers 75 1
59 6 tyrone tex 250 3 4 albert aames 90 1
60 6 tyrone tex 250 3 5 shawn stevens 150 2
61 6 tyrone tex 250 3 6 tyrone tex 250 3
62 6 tyrone tex 250 3 7 calvin coolidge 75 3
63 6 tyrone tex 250 3 8 harold hughs 190 2
64 6 tyrone tex 250 3 9 tanya tamashevsky 150 7
65 6 tyrone tex 250 3 10 jorge jones 175 2
66 6 tyrone tex 250 3 11 claire coolidge 110 7
67 7 calvin coolidge 75 3 1 ann sanders 200 NA
68 7 calvin coolidge 75 3 2 floyd flake 50 1
69 7 calvin coolidge 75 3 3 randall rodgers 75 1
70 7 calvin coolidge 75 3 4 albert aames 90 1
71 7 calvin coolidge 75 3 5 shawn stevens 150 2
72 7 calvin coolidge 75 3 6 tyrone tex 250 3
73 7 calvin coolidge 75 3 7 calvin coolidge 75 3
74 7 calvin coolidge 75 3 8 harold hughs 190 2
75 7 calvin coolidge 75 3 9 tanya tamashevsky 150 7
76 7 calvin coolidge 75 3 10 jorge jones 175 2
77 7 calvin coolidge 75 3 11 claire coolidge 110 7
78 8 harold hughs 190 2 1 ann sanders 200 NA
79 8 harold hughs 190 2 2 floyd flake 50 1
80 8 harold hughs 190 2 3 randall rodgers 75 1
81 8 harold hughs 190 2 4 albert aames 90 1
82 8 harold hughs 190 2 5 shawn stevens 150 2
83 8 harold hughs 190 2 6 tyrone tex 250 3
84 8 harold hughs 190 2 7 calvin coolidge 75 3
85 8 harold hughs 190 2 8 harold hughs 190 2
86 8 harold hughs 190 2 9 tanya tamashevsky 150 7
87 8 harold hughs 190 2 10 jorge jones 175 2
88 8 harold hughs 190 2 11 claire coolidge 110 7
89 9 tanya tamashevsky 150 7 1 ann sanders 200 NA
90 9 tanya tamashevsky 150 7 2 floyd flake 50 1
91 9 tanya tamashevsky 150 7 3 randall rodgers 75 1
92 9 tanya tamashevsky 150 7 4 albert aames 90 1
93 9 tanya tamashevsky 150 7 5 shawn stevens 150 2
94 9 tanya tamashevsky 150 7 6 tyrone tex 250 3
95 9 tanya tamashevsky 150 7 7 calvin coolidge 75 3
96 9 tanya tamashevsky 150 7 8 harold hughs 190 2
97 9 tanya tamashevsky 150 7 9 tanya tamashevsky 150 7
98 9 tanya tamashevsky 150 7 10 jorge jones 175 2
99 9 tanya tamashevsky 150 7 11 claire coolidge 110 7
100 10 jorge jones 175 2 1 ann sanders 200 NA
101 10 jorge jones 175 2 2 floyd flake 50 1
102 10 jorge jones 175 2 3 randall rodgers 75 1
103 10 jorge jones 175 2 4 albert aames 90 1
104 10 jorge jones 175 2 5 shawn stevens 150 2
105 10 jorge jones 175 2 6 tyrone tex 250 3
106 10 jorge jones 175 2 7 calvin coolidge 75 3
107 10 jorge jones 175 2 8 harold hughs 190 2
108 10 jorge jones 175 2 9 tanya tamashevsky 150 7
109 10 jorge jones 175 2 10 jorge jones 175 2
110 10 jorge jones 175 2 11 claire coolidge 110 7
111 11 claire coolidge 110 7 1 ann sanders 200 NA
112 11 claire coolidge 110 7 2 floyd flake 50 1
113 11 claire coolidge 110 7 3 randall rodgers 75 1
114 11 claire coolidge 110 7 4 albert aames 90 1
115 11 claire coolidge 110 7 5 shawn stevens 150 2
116 11 claire coolidge 110 7 6 tyrone tex 250 3
117 11 claire coolidge 110 7 7 calvin coolidge 75 3
118 11 claire coolidge 110 7 8 harold hughs 190 2
119 11 claire coolidge 110 7 9 tanya tamashevsky 150 7
120 11 claire coolidge 110 7 10 jorge jones 175 2
121 11 claire coolidge 110 7 11 claire coolidge 110 7
33.22 Practice (question #rs are from booksDatabase Word document.)
#######################################################################.
# PRACTICE with subqueries, outer joins, unions and other stuff.
# Note that many of the questions shown below come from the
# booksDatabase word document. The numbers for the questions
# are the numbers from that document.
#######################################################################.
33.22.1 Question 35
# Show the title names and number of pages for those books that are
# longer than (ie. more pages than) the average length of all books.
#~~~~~~~~~~~~~~
# ANSWER
#~~~~~~~~~~~~~~
sqldf("
select title_name , pages
from titles
where pages >= (select avg(pages) from titles)
")
title_name pages
1 Ask Yor System Administrator 1226
2 But I Did It Unconciously 510
3 How About Never? 473
4 Perhaps It's a Glandular Problem 826
5 Spontaneous, Not Annoying 507
6 What Are The Civilian Applications? 802
33.22.2 Question 36
# Show each publisher's name and the number of authors who have written books
# for that publisher.
#~~~~~~~~~~~~~~
# ANSWER
#~~~~~~~~~~~~~~
sqldf("
select pub_name, count(DISTINCT authors.au_id) as '# of authors'
from authors join title_authors on authors.au_id = title_authors.au_id
join titles on title_authors.title_id = titles.title_id
join publishers on publishers.pub_id = titles.pub_id
group by publishers.pub_id, pub_name;
")
pub_name # of authors
1 Abatis Publishers 5
2 Core Dump Books 1
3 Schandenfreude Press 3
4 AAA Publishing 4
# EXPLANATION
# The following query shows the first step that would be necessariy to thinking about
# how to find the final answer (shown above). The following shows a separate
# row for each publisher and an author that wrote for that publisher. If you
# order the results by publisher, it is easy to see that you can think of
# each different publisher as a "group". That is what the group by
# does in the answer (see above). The count(DISTINCT authors.au_id)
# gets the number of "distinct" authors for the particular publisher.
# The "distinct" is necessary to ensure that you don't count an author
# twice for the same publisher (e.g. Sarah Buchman wrote two books
# for Schandenfreude Press but should only be counted once as an
# author for Scandenfreude Press)
sqldf("
select pub_name, authors.au_fname, authors.au_lname
from authors join title_authors on authors.au_id = title_authors.au_id
join titles on title_authors.title_id = titles.title_id
join publishers on publishers.pub_id = titles.pub_id
order by pub_name, au_fname, au_lname
")
pub_name au_fname au_lname
1 AAA Publishing Hallie Hull
2 AAA Publishing Harvey Kellsey
3 AAA Publishing Klee Hull
4 AAA Publishing Wendy Heydemark
5 AAA Publishing Wendy Heydemark
6 Abatis Publishers Hallie Hull
7 Abatis Publishers Harvey Kellsey
8 Abatis Publishers Harvey Kellsey
9 Abatis Publishers Klee Hull
10 Abatis Publishers Klee Hull
11 Abatis Publishers Sarah Buchman
12 Abatis Publishers Wendy Heydemark
13 Core Dump Books Christian Kells
14 Schandenfreude Press Klee Hull
15 Schandenfreude Press Sarah Buchman
16 Schandenfreude Press Sarah Buchman
17 Schandenfreude Press Wendy Heydemark
33.22.3 Question 36.5
# Show the pub_name and the number of titles published by that publisher.
#~~~~~~~~~~~~~~
# ANSWER
#~~~~~~~~~~~~~~
sqldf("select pub_name, count(*) as num_titles
from publishers join titles on publishers.pub_id = titles.pub_id
group by publishers.pub_id, pub_name
order by pub_name")
pub_name num_titles
1 AAA Publishing 3
2 Abatis Publishers 6
3 Core Dump Books 1
4 Schandenfreude Press 3
33.22.4 Question 37
# Show the pub_name and the number of titles published by that publisher.
# Only include publishers for whom at least 4 authors have worked.
#
# (this question combines the concepts of the previous two questions)
#~~~~~~~~~~~~~~~~.
# ANSWER
#~~~~~~~~~~~~~~~~.
sqldf("
select pub_name, count(*) as 'Number of Titles'
from publishers as pub1 join titles on pub1.pub_id = titles.pub_id
group by pub1.pub_id, pub_name
having 4 <= ( select count(DISTINCT authors.au_id)
from authors join title_authors on authors.au_id = title_authors.au_id
join titles on title_authors.title_id = titles.title_id
join publishers as pub2 on pub2.pub_id = titles.pub_id
where pub2.pub_id = pub1.pub_id )
order by pub_name
")
pub_name Number of Titles
1 AAA Publishing 3
2 Abatis Publishers 6
# Note - the subquery is a correlated subquery.
# It returns the number of authors who published with a particular publisher.
# The following are the acutal subqueries that are run for each publisher:
# This happens automatically.
# For Abatis - 'P01'
sqldf(" select count(DISTINCT authors.au_id)
from authors join title_authors on authors.au_id = title_authors.au_id
join titles on title_authors.title_id = titles.title_id
join publishers as pub2 on pub2.pub_id = titles.pub_id
where pub2.pub_id = 'P01'")
count(DISTINCT authors.au_id)
1 5
# For Core Dump Books - 'P02'
sqldf(" select count(DISTINCT authors.au_id)
from authors join title_authors on authors.au_id = title_authors.au_id
join titles on title_authors.title_id = titles.title_id
join publishers as pub2 on pub2.pub_id = titles.pub_id
where pub2.pub_id = 'P02'")
count(DISTINCT authors.au_id)
1 1
33.22.5 Question 38
# Show the names of publishers who did not publish any books.
# (I guess they are just getting started in the business ... )
#~~~~~~~~~~~~~~~~.
# ANSWER
#~~~~~~~~~~~~~~~~.
# ANSWER - with a subquery
sqldf("
select pub_name
from publishers as pub1
where not exists
( select *
from publishers as pub2 join titles on pub2.pub_id = titles.pub_id
where pub2.pub_id = pub1.pub_id );
")
pub_name
1 Tneterhooks Press
# ANSWER - with a left join
sqldf("
select pub_name from publishers left join titles on publishers.pub_id = titles.pub_id
where title_id is NULL;
")
pub_name
1 Tneterhooks Press
33.22.6 Question 39(a)
# List the name of each publisher and the total number of books that
# each publisher has published. Sort the results so that the publishers who
# published the most books appear at the top. If two or more publishers
# published the same number of books then they should be listed in alphabetical order.
#
# a. The answer should only include publisher who have published some books.
#~~~~~~~~~~~~~~~~.
# ANSWER
#~~~~~~~~~~~~~~~~.
sqldf("
select pub_name, count(*)
from titles join publishers on publishers.pub_id = titles.pub_id
group by pub_name
order by count(*) desc, pub_name;
")
pub_name count(*)
1 Abatis Publishers 6
2 AAA Publishing 3
3 Schandenfreude Press 3
4 Core Dump Books 1
33.22.7 Question 39(b)
# (2nd part of previous question)
#
# b. This time make sure to also include publishers who have published zero books
#~~~~~~~~~~~~~~~~.
# ANSWER
#~~~~~~~~~~~~~~~~.
sqldf("
select pub_name, count(titles.title_id)
from publishers left join titles on publishers.pub_id = titles.pub_id
group by pub_name
order by count(*) desc, pub_name;
")
pub_name count(titles.title_id)
1 Abatis Publishers 6
2 AAA Publishing 3
3 Schandenfreude Press 3
4 Core Dump Books 1
5 Tneterhooks Press 0
# ANSWER - with a UNION - however the previous answer is shorter and more to the point
sqldf("
select pub_name , 0 as NumTitles
from publishers left join titles on publishers.pub_id = titles.pub_id
where title_id is NULL
UNION
select pub_name, count(*) as NumTitles
from titles join publishers on publishers.pub_id = titles.pub_id
group by pub_name;
")
pub_name NumTitles
1 AAA Publishing 3
2 Abatis Publishers 6
3 Core Dump Books 1
4 Schandenfreude Press 3
5 Tneterhooks Press 0
33.22.8 Question 46
#################.
# QUESTION
#################.
# Show the names of the most prolific authors, i.e. the authors
# who have written the most books. Note that there could be
# several different authors who are "tied" for the most number of books
# written. Sort the names in alphabetical order (last name, first name)
# Also show the au_id and the number of titles that the author wrote.
#~~~~~~~~~~~~~~~~.
# ANSWER
#~~~~~~~~~~~~~~~~.
# This is a rather complex answer.
# This approach uses a subquery within a subquery.
# There may be other approaches too.
# There may be simpler answers too.
#
# To explain we will show the subqueries and what they produce.
# Then we will show the full query that includes the subquery.
#
# The following query is NOT the answer. It is simply a query that
# shows the number of books that were written by each author. This
# will be used as a subquery in the answer shown below.
sqldf("
select authors.au_id, count(*) as numTitles
from authors join title_authors on authors.au_id = title_authors.au_id
group by authors.au_id, au_fname, au_lname
")
au_id numTitles
1 A01 3
2 A02 4
3 A03 2
4 A04 4
5 A05 1
6 A06 3
# We'd like to get the max number of titles that an author wrote.
# This amounts to getting the highest number from the numTitles column
# in the previous query. We can do that by using the above query
# as a subquery in the from clause.
sqldf("
select max(numTitles2)
from
(select authors.au_id, count(*) as numTitles2
from authors join title_authors on authors.au_id = title_authors.au_id
group by authors.au_id, au_fname, au_lname)
")
max(numTitles2)
1 4
# Finally we can put this all together to answer our question.
# The following is the FULL ANSWER to the original question.
# It uses the above query as a subquery in the having clause (see
# the code below). Note that in the following code there is a
# subquery inside of a subquery.
#
# FINAL ANSWER:
sqldf("
select authors.au_id, au_fname, au_lname, count(*) as numTitles
from authors join title_authors on authors.au_id = title_authors.au_id
group by authors.au_id, au_fname, au_lname
having numTitles =
(select max(numTitles2)
from
(select authors.au_id, count(*) as numTitles2
from authors join title_authors on authors.au_id = title_authors.au_id
group by authors.au_id, au_fname, au_lname))
order by au_lname, au_fname
")
au_id au_fname au_lname numTitles
1 A02 Wendy Heydemark 4
2 A04 Klee Hull 4