# Load necessary library
library(readr)
library(sqldf)
## Warning: package 'sqldf' was built under R version 4.4.2
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 4.4.2
## Loading required package: proto
## Warning: package 'proto' was built under R version 4.4.2
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 4.4.2
# Function to load museum data
load_museum_data <- function(file_paths) {
data_list <- list()
for (name in names(file_paths)) {
path <- file_paths[[name]]
tryCatch({
# Attempt to read the CSV file into a dataframe
data_list[[name]] <- read_csv(path)
}, error = function(e) {
# Store the error message if file reading fails
data_list[[name]] <- paste("Error:", e$message)
})
}
return(data_list)
}
# Base path for all files
base_path <- "C:/Users/Nandan Hegde/OneDrive/Documents/MSU_Grad_Studies/Semester2/STT811/STT811_ICA/Museums/"
# File paths defined more efficiently
file_paths <- list(
artist = paste0(base_path, "artist.csv"),
canvas_size = paste0(base_path, "canvas_size.csv"),
museum_hours = paste0(base_path, "museum_hours.csv"),
museum = paste0(base_path, "museum.csv"),
product_size = paste0(base_path, "product_size.csv"),
subject = paste0(base_path, "subject.csv"),
work = paste0(base_path, "work.csv")
)
# Load the data
museum_data <- load_museum_data(file_paths)
## Rows: 421 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): full_name, first_name, middle_names, last_name, nationality, style
## dbl (3): artist_id, birth, death
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 200 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): label
## dbl (3): size_id, width, height
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 351 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): day, close
## dbl (1): museum_id
## time (1): open
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 57 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): name, address, city, state, postal, country, phone, url
## dbl (1): museum_id
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 110347 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): work_id, size_id, sale_price, regular_price
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 6771 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): subject
## dbl (1): work_id
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 14776 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, style
## dbl (3): work_id, artist_id, museum_id
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display column names for each table
for (name in names(museum_data)) {
if (is.data.frame(museum_data[[name]])) {
cat(name, "columns:", paste(colnames(museum_data[[name]]), collapse = ", "), "\n")
} else {
cat(name, "could not be loaded:", museum_data[[name]], "\n")
}
}
## artist columns: artist_id, full_name, first_name, middle_names, last_name, nationality, style, birth, death
## canvas_size columns: size_id, width, height, label
## museum_hours columns: museum_id, day, open, close
## museum columns: museum_id, name, address, city, state, postal, country, phone, url
## product_size columns: work_id, size_id, sale_price, regular_price
## subject columns: work_id, subject
## work columns: work_id, name, artist_id, style, museum_id
artist <- museum_data$artist
canvas_size <- museum_data$canvas_size
museum_hours <- museum_data$museum_hours
museum <- museum_data$museum
product_size <- museum_data$product_size
subject <- museum_data$subject
work <- museum_data$work
# Question 1
museums_in_france <- sqldf("SELECT name FROM museum WHERE country = 'France'")
cat("Museums in France:\n")
## Museums in France:
print(museums_in_france)
## name
## 1 Museum of Grenoble
## 2 Musée des Beaux-Arts de Quimper
## 3 Musée du Louvre
## 4 Musée d'Orsay
## 5 Museum of Fine Arts of Nancy
## 6 Musée Marmottan Monet
## 7 Army Museum
# Question 2
most_works <- sqldf(
"SELECT m.name, COUNT(w.work_id) AS work_count
FROM work w
JOIN museum m ON w.museum_id = m.museum_id
GROUP BY m.name
ORDER BY work_count DESC
LIMIT 1"
)
cat("Museum with the most works:\n")
## Museum with the most works:
print(most_works)
## name work_count
## 1 The Metropolitan Museum of Art 939
test_museum <- sqldf(
"WITH avg_price_museum AS (
SELECT m.museum_id,w.work_id, m.name, AVG(ps.sale_price) AS avg_price
FROM artist a
JOIN work w ON a.artist_id = w.artist_id
JOIN product_size ps ON w.work_id = ps.work_id
JOIN museum m ON w.museum_id = m.museum_id
WHERE a.birth >= '1800' AND a.birth < '1900'
AND ps.sale_price IS NOT NULL
GROUP BY m.museum_id, m.name
ORDER BY avg_price DESC
LIMIT 1)
SELECT * FROM avg_price_museum"
)
test_artist <- sqldf("SELECT COUNT(DISTINCT a.artist_id) AS distinct_artist_count
FROM artist a
JOIN work w ON a.artist_id = w.artist_id
WHERE a.birth >= '1900' AND a.birth < '2000' AND w.museum_id = (SELECT museum_id FROM test_museum)")
# Question 3
result <- sqldf(
"WITH avg_price_museum AS (
SELECT m.museum_id, m.name, AVG(ps.sale_price) AS avg_price
FROM artist a
JOIN work w ON a.artist_id = w.artist_id
JOIN product_size ps ON w.work_id = ps.work_id
JOIN museum m ON w.museum_id = m.museum_id
WHERE a.birth >= '1800' AND a.birth < '1900'
AND ps.sale_price IS NOT NULL
GROUP BY m.museum_id, m.name
ORDER BY avg_price DESC
LIMIT 1
),
distinct_artists AS (
SELECT COUNT(DISTINCT a.artist_id) AS distinct_artist_count_20th_century
FROM artist a
JOIN work w ON a.artist_id = w.artist_id
WHERE a.birth >= '1900' AND a.birth < '2000' AND w.museum_id = (SELECT museum_id FROM avg_price_museum)
)
SELECT apm.name AS museum_name, apm.avg_price, distinct_artist_count_20th_century
FROM avg_price_museum apm, distinct_artists da"
)
cat("Museum with highest average price and 20th-century artist contributions:\n")
## Museum with highest average price and 20th-century artist contributions:
print(result)
## museum_name avg_price distinct_artist_count_20th_century
## 1 Museum of Fine Arts 546.6667 0