library(jsonlite)
17. JSON Parsing Practice in R
These exercises ask you to retrieve specific information from JSON that has been processed by the fromJSON() function.
Each exercise includes some JSON and then specific questions that ask you to retrieve specific data from that JSON.
The questions are rated easy, medium, or harder based on the following:
easy questions: data is readily accessible by simply using list, dataframe and matrix notation
medium questions: you might need additional functions such as unlist() or others.
harder questions: require you to use either apply functions or loops to solve the problem.
17.1 Exercise: Basic JSON Object Access
Answer the questions below by referring to the following JSON response from an API.
Click to see JSON
# API response stored in variable 'weather_data'.
# fromJSON() parses the JSON text below into nested R lists / data frames.
weather_data <- fromJSON('{
"location": "Seattle",
"current": {
"temperature": 12.5,
"conditions": "Cloudy",
"wind": {
"speed": 15.5,
"direction": "NW"
}
},
"forecast": [
{"day": "Monday", "high": 14, "low": 8},
{"day": "Tuesday", "high": 16, "low": 9},
{"day": "Wednesday", "high": 15, "low": 7}
]
}')
Click to see fromJSON() output
# THE STRUCTURE
str(weather_data)
List of 3
$ location: chr "Seattle"
$ current :List of 3
..$ temperature: num 12.5
..$ conditions : chr "Cloudy"
..$ wind :List of 2
.. ..$ speed : num 15.5
.. ..$ direction: chr "NW"
$ forecast:'data.frame': 3 obs. of 3 variables:
..$ day : chr [1:3] "Monday" "Tuesday" "Wednesday"
..$ high: int [1:3] 14 16 15
..$ low : int [1:3] 8 9 7
# THE DATA
weather_data
$location
[1] "Seattle"
$current
$current$temperature
[1] 12.5
$current$conditions
[1] "Cloudy"
$current$wind
$current$wind$speed
[1] 15.5
$current$wind$direction
[1] "NW"
$forecast
day high low
1 Monday 14 8
2 Tuesday 16 9
3 Wednesday 15 7
17.1.1 Question (easy): Write R code to extract the current temperature.
Click for answer
# Walk the nested list by name: current -> temperature
weather_data$current$temperature
[1] 12.5
# or alternatively, using [[ ]] with the element names as strings
weather_data[["current"]][["temperature"]]
[1] 12.5
17.1.2 Question (easy): Write R code to create a vector of all forecasted high temperatures.
Click for answer
# forecast is a data frame, so $high returns the whole column as a vector
weather_data$forecast$high
[1] 14 16 15
# or, using data frame [row, column] notation
weather_data$forecast[, "high"]
[1] 14 16 15
17.2 Exercise: E-commerce Product Data
Answer the questions below by referring to the following JSON response from an e-commerce API.
Click to see JSON
# API response stored in variable 'product_data'.
# fromJSON() parses the JSON text below into nested R lists / data frames.
product_data <- fromJSON('{
"store": "TechStore",
"products": [
{
"id": "p123",
"name": "Wireless Headphones",
"price": 89.99,
"stock": 45,
"colors": ["black", "white", "blue"],
"specs": {
"battery": "20 hours",
"connectivity": "Bluetooth 5.0",
"weight": "250g"
}
},
{
"id": "p124",
"name": "Smart Watch",
"price": 199.99,
"stock": 28,
"colors": ["black", "silver"],
"specs": {
"battery": "48 hours",
"connectivity": "Bluetooth 5.0",
"weight": "45g"
}
},
{
"id": "p125",
"name": "Wireless Charger",
"price": 29.99,
"stock": 120,
"colors": ["black", "white"],
"specs": {
"input": "QC 3.0",
"output": "15W",
"weight": "80g"
}
}
],
"last_updated": "2024-02-20"
}')
Click to see fromJSON() output
# THE STRUCTURE
str(product_data)
List of 3
$ store : chr "TechStore"
$ products :'data.frame': 3 obs. of 6 variables:
..$ id : chr [1:3] "p123" "p124" "p125"
..$ name : chr [1:3] "Wireless Headphones" "Smart Watch" "Wireless Charger"
..$ price : num [1:3] 90 200 30
..$ stock : int [1:3] 45 28 120
..$ colors:List of 3
.. ..$ : chr [1:3] "black" "white" "blue"
.. ..$ : chr [1:2] "black" "silver"
.. ..$ : chr [1:2] "black" "white"
..$ specs :'data.frame': 3 obs. of 5 variables:
.. ..$ battery : chr [1:3] "20 hours" "48 hours" NA
.. ..$ connectivity: chr [1:3] "Bluetooth 5.0" "Bluetooth 5.0" NA
.. ..$ weight : chr [1:3] "250g" "45g" "80g"
.. ..$ input : chr [1:3] NA NA "QC 3.0"
.. ..$ output : chr [1:3] NA NA "15W"
$ last_updated: chr "2024-02-20"
# THE DATA
product_data
$store
[1] "TechStore"
$products
id name price stock colors specs.battery
1 p123 Wireless Headphones 89.99 45 black, white, blue 20 hours
2 p124 Smart Watch 199.99 28 black, silver 48 hours
3 p125 Wireless Charger 29.99 120 black, white <NA>
specs.connectivity specs.weight specs.input specs.output
1 Bluetooth 5.0 250g <NA> <NA>
2 Bluetooth 5.0 45g <NA> <NA>
3 <NA> 80g QC 3.0 15W
$last_updated
[1] "2024-02-20"
17.2.1 Question (easy): Write R code to extract the name and price of the second product.
Click for answer
# Extract name
product_data$products$name[2]
[1] "Smart Watch"
# or, using data frame [row, column] notation
product_data$products[2, "name"]
[1] "Smart Watch"
# Extract price
product_data$products$price[2]
[1] 199.99
# or, using data frame [row, column] notation
product_data$products[2, "price"]
[1] 199.99
# Combined as a named vector
c(name = product_data$products$name[2], price = product_data$products$price[2])
name price
"Smart Watch" "199.99"
17.2.2 Question (easy): Write R code to extract all available colors for the Wireless Headphones product.
Click for answer
# Find the row index for the Wireless Headphones
headphone_index <- which(product_data$products$name == "Wireless Headphones")
# Extract the colors. 'colors' is a list column, so [[ ]] pulls out the
# character vector stored for that one product.
product_data$products$colors[[headphone_index]]
[1] "black" "white" "blue"
17.2.3 Question (easy): Write R code to create a data frame showing each product’s name, price, and stock.
Click for answer
# Build the summary from the three columns of the products data frame;
# with() lets the columns be referenced by their bare names.
with(
  product_data$products,
  data.frame(name = name, price = price, stock = stock)
)
name price stock
1 Wireless Headphones 89.99 45
2 Smart Watch 199.99 28
3 Wireless Charger 29.99 120
# Alternatively, keep all rows and select the three columns by name
product_data$products[, c("name", "price", "stock")]
name price stock
1 Wireless Headphones 89.99 45
2 Smart Watch 199.99 28
3 Wireless Charger 29.99 120
17.4 Exercise: Nested Arrays
Answer the questions below by referring to the following JSON response from an API.
Click to see JSON
# API response stored in variable 'library_data'.
# fromJSON() parses the JSON text below into nested R lists / data frames.
library_data <- fromJSON('{
"library": {
"name": "Central Library",
"books": [
{
"title": "Data Science Basics",
"authors": ["Smith, J.", "Jones, K."],
"categories": ["programming", "statistics"],
"ratings": [4.5, 4.8, 4.2]
},
{
"title": "Statistics Using R",
"authors": ["Smith, J.", "Jones, K."],
"categories": ["statistics", "R"],
"ratings": [4.5, 4.8, 4.2]
},
{
"title": "R Programming Guide",
"authors": ["Wilson, M."],
"categories": ["programming", "R"],
"ratings": [4.7, 4.6, 4.9, 4.5]
}
]
}
}')
Click to see fromJSON() output
# THE STRUCTURE
str(library_data)
List of 1
$ library:List of 2
..$ name : chr "Central Library"
..$ books:'data.frame': 3 obs. of 4 variables:
.. ..$ title : chr [1:3] "Data Science Basics" "Statistics Using R" "R Programming Guide"
.. ..$ authors :List of 3
.. .. ..$ : chr [1:2] "Smith, J." "Jones, K."
.. .. ..$ : chr [1:2] "Smith, J." "Jones, K."
.. .. ..$ : chr "Wilson, M."
.. ..$ categories:List of 3
.. .. ..$ : chr [1:2] "programming" "statistics"
.. .. ..$ : chr [1:2] "statistics" "R"
.. .. ..$ : chr [1:2] "programming" "R"
.. ..$ ratings :List of 3
.. .. ..$ : num [1:3] 4.5 4.8 4.2
.. .. ..$ : num [1:3] 4.5 4.8 4.2
.. .. ..$ : num [1:4] 4.7 4.6 4.9 4.5
# THE DATA
library_data
$library
$library$name
[1] "Central Library"
$library$books
title authors categories
1 Data Science Basics Smith, J., Jones, K. programming, statistics
2 Statistics Using R Smith, J., Jones, K. statistics, R
3 R Programming Guide Wilson, M. programming, R
ratings
1 4.5, 4.8, 4.2
2 4.5, 4.8, 4.2
3 4.7, 4.6, 4.9, 4.5
17.4.1 Question (medium): Write code to create a vector of all unique book categories
across all books.
Click for answer
unique(unlist(library_data$library$books$categories))
[1] "programming" "statistics" "R"
17.4.2 Question (harder): get named vector of ratings for each book
Calculate the average rating for each book and store it in a named vector where names are book titles.
Click for answer
# 'ratings' is a list with one numeric vector per book. vapply() applies
# mean() to each vector and guarantees a numeric vector comes back.
bookTitles <- vapply(library_data$library$books$ratings, mean, numeric(1))
# Label each average with the title of the book it belongs to.
names(bookTitles) <- library_data$library$books$title
bookTitles
Data Science Basics Statistics Using R R Programming Guide
4.500 4.500 4.675
17.5 Exercise: Complex Nested Structures
Answer the questions below by referring to the following JSON response from a social media API.
The API was called with the following URL: https://api.somesite.com/userinfo/Alex Chen
Click to see JSON
# API response stored in variable 'social_data'
# The response includes info about a single user "Alex Chen"
social_data <- fromJSON('{
"user": {
"id": "u123",
"name": "Alex Chen",
"posts": [
{
"id": "p1",
"content": "Learning R!",
"timestamp": "2024-02-15",
"comments": [
{
"user": "Carol Wu",
"text": "Check out tidyverse",
"likes": 5
},
{
"user": "Bob Smith",
"text": "Great choice!",
"likes": 3
},
{
"user": "Carol Wu",
"text": "tidyverse makes many things easier",
"likes": 2
}
]
},
{
"id": "p2",
"content": "JSON parsing is fun",
"timestamp": "2024-02-16",
"comments": [
{
"user": "Mike Jones",
"text": "Try jsonlite",
"likes": 4
}
]
},
{
"id": "p3",
"content": "Coffee is key :)",
"timestamp": "2024-02-16",
"comments": [
{
"user": "Carol Wu",
"text": "I totally agree!",
"likes": 1
},
{
"user": "Mike Jones",
"text": "yessss",
"likes": 0
}
]
}
]
}
}')
Click to see fromJSON() output
# THE STRUCTURE
str(social_data)
List of 1
$ user:List of 3
..$ id : chr "u123"
..$ name : chr "Alex Chen"
..$ posts:'data.frame': 3 obs. of 4 variables:
.. ..$ id : chr [1:3] "p1" "p2" "p3"
.. ..$ content : chr [1:3] "Learning R!" "JSON parsing is fun" "Coffee is key :)"
.. ..$ timestamp: chr [1:3] "2024-02-15" "2024-02-16" "2024-02-16"
.. ..$ comments :List of 3
.. .. ..$ :'data.frame': 3 obs. of 3 variables:
.. .. .. ..$ user : chr [1:3] "Carol Wu" "Bob Smith" "Carol Wu"
.. .. .. ..$ text : chr [1:3] "Check out tidyverse" "Great choice!" "tidyverse makes many things easier"
.. .. .. ..$ likes: int [1:3] 5 3 2
.. .. ..$ :'data.frame': 1 obs. of 3 variables:
.. .. .. ..$ user : chr "Mike Jones"
.. .. .. ..$ text : chr "Try jsonlite"
.. .. .. ..$ likes: int 4
.. .. ..$ :'data.frame': 2 obs. of 3 variables:
.. .. .. ..$ user : chr [1:2] "Carol Wu" "Mike Jones"
.. .. .. ..$ text : chr [1:2] "I totally agree!" "yessss"
.. .. .. ..$ likes: int [1:2] 1 0
# THE DATA
social_data
$user
$user$id
[1] "u123"
$user$name
[1] "Alex Chen"
$user$posts
id content timestamp
1 p1 Learning R! 2024-02-15
2 p2 JSON parsing is fun 2024-02-16
3 p3 Coffee is key :) 2024-02-16
comments
1 Carol Wu, Bob Smith, Carol Wu, Check out tidyverse, Great choice!, tidyverse makes many things easier, 5, 3, 2
2 Mike Jones, Try jsonlite, 4
3 Carol Wu, Mike Jones, I totally agree!, yessss, 1, 0
17.5.1 Question (harder): Create a data frame containing each comment’s user and number of likes.
Click for answer
# Method 1 - extract vectors separately and combine into a dataframe.
# lapply() always returns a list, so unlist() flattens it into one vector
# even if every post happens to have the same number of comments (sapply()
# would simplify that case to a matrix, which unlist() leaves unchanged).
users <- unlist(lapply(social_data$user$posts$comments, function(x) x$user))
likes <- unlist(lapply(social_data$user$posts$comments, function(x) x$likes))
data.frame(user = users, likes = likes)
user likes
1 Carol Wu 5
2 Bob Smith 3
3 Carol Wu 2
4 Mike Jones 4
5 Carol Wu 1
6 Mike Jones 0
# Method 2: Using do.call to combine list of data frames.
# Each per-post data frame is cut down to the two wanted columns, then
# rbind() stacks all of them into a single data frame.
do.call(rbind, lapply(social_data$user$posts$comments, function(comments) {
  comments[, c("user", "likes")]
}))
user likes
1 Carol Wu 5
2 Bob Smith 3
3 Carol Wu 2
4 Mike Jones 4
5 Carol Wu 1
6 Mike Jones 0
# Method 3: Using tidyverse if available
library(tidyverse)
Warning: package 'stringr' was built under R version 4.4.3
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ purrr::flatten() masks jsonlite::flatten()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
bind_rows(social_data$user$posts$comments)[,c("user","likes")]
user likes
1 Carol Wu 5
2 Bob Smith 3
3 Carol Wu 2
4 Mike Jones 4
5 Carol Wu 1
6 Mike Jones 0
17.5.2 Question (harder): function returning all comments by specific user
Create a function that takes the social_data object and a username as parameters, then returns all comments made by that user.
Click for answer
#' Collect the text of every comment written by a given user.
#'
#' @param data Parsed API response; the comments are read from
#'   `data$user$posts$comments`, a list holding one data frame per post with
#'   the columns `user`, `text` and `likes`.
#' @param username Name of the commenter to look up.
#' @return A vector with the matching comment texts, in post order.
find_user_comments <- function(data, username) {
  # comments_by_post is a list of data frames, one per post.
  comments_by_post <- data$user$posts$comments

  # Keep only the text written by `username` in each post. `%in%` is used
  # instead of `==` so that a missing (NA) commenter name is treated as
  # "no match" rather than contributing an NA entry to the result.
  texts_by_post <- lapply(comments_by_post, function(df) {
    df$text[df$user %in% username]
  })

  # Combine the per-post vectors into a single vector.
  unlist(texts_by_post)
}
# Test the function
find_user_comments(social_data, "Carol Wu")
[1] "Check out tidyverse" "tidyverse makes many things easier"
[3] "I totally agree!"
17.6 Exercise: Working with Lists of Data Frames
Answer the questions below by referring to the following JSON response from a fitness tracking API:
Click to see JSON
# API response stored in variable 'fitness_data'.
# fromJSON() parses the JSON text below into nested R lists / data frames.
fitness_data <- fromJSON('{
"user_id": "f789",
"tracking_data": {
"daily_steps": [
{"date": "2024-02-14", "steps": 8432, "active_minutes": 45},
{"date": "2024-02-15", "steps": 10234, "active_minutes": 62},
{"date": "2024-02-16", "steps": 7321, "active_minutes": 38}
],
"workouts": [
{
"type": "running",
"sessions": [
{"date": "2024-02-14", "duration": 30, "distance": 5.2},
{"date": "2024-02-14", "duration": 25, "distance": 3.9},
{"date": "2024-02-16", "duration": 25, "distance": 4.1}
]
},
{
"type": "cycling",
"sessions": [
{"date": "2024-02-14", "duration": 30, "distance": 9.1},
{"date": "2024-02-15", "duration": 45, "distance": 15.3}
]
}
]
}
}')
Click to see fromJSON() output
# THE STRUCTURE
str(fitness_data)
List of 2
$ user_id : chr "f789"
$ tracking_data:List of 2
..$ daily_steps:'data.frame': 3 obs. of 3 variables:
.. ..$ date : chr [1:3] "2024-02-14" "2024-02-15" "2024-02-16"
.. ..$ steps : int [1:3] 8432 10234 7321
.. ..$ active_minutes: int [1:3] 45 62 38
..$ workouts :'data.frame': 2 obs. of 2 variables:
.. ..$ type : chr [1:2] "running" "cycling"
.. ..$ sessions:List of 2
.. .. ..$ :'data.frame': 3 obs. of 3 variables:
.. .. .. ..$ date : chr [1:3] "2024-02-14" "2024-02-14" "2024-02-16"
.. .. .. ..$ duration: int [1:3] 30 25 25
.. .. .. ..$ distance: num [1:3] 5.2 3.9 4.1
.. .. ..$ :'data.frame': 2 obs. of 3 variables:
.. .. .. ..$ date : chr [1:2] "2024-02-14" "2024-02-15"
.. .. .. ..$ duration: int [1:2] 30 45
.. .. .. ..$ distance: num [1:2] 9.1 15.3
# THE DATA
fitness_data
$user_id
[1] "f789"
$tracking_data
$tracking_data$daily_steps
date steps active_minutes
1 2024-02-14 8432 45
2 2024-02-15 10234 62
3 2024-02-16 7321 38
$tracking_data$workouts
type sessions
1 running 2024-02-14, 2024-02-14, 2024-02-16, 30, 25, 25, 5.2, 3.9, 4.1
2 cycling 2024-02-14, 2024-02-15, 30, 45, 9.1, 15.3
17.6.1 Question (harder): Create a data frame with total distance covered for each type of workout.
Click for answer
# One row per workout type. 'sessions' is a list with one data frame per
# workout type; vapply() sums each data frame's distance column and
# guarantees a numeric vector comes back.
workout_summary <- data.frame(
  type = fitness_data$tracking_data$workouts$type,
  total_distance = vapply(
    fitness_data$tracking_data$workouts$sessions,
    function(x) sum(x$distance),
    numeric(1)
  )
)
workout_summary
type total_distance
1 running 13.2
2 cycling 24.4