ICA1.knit

# Load necessary library
library(readr)
library(sqldf)

## Warning: package 'sqldf' was built under R version 4.4.2

## Loading required package: gsubfn

## Warning: package 'gsubfn' was built under R version 4.4.2

## Loading required package: proto

## Warning: package 'proto' was built under R version 4.4.2

## Loading required package: RSQLite

## Warning: package 'RSQLite' was built under R version 4.4.2

# Function to load museum data
load_museum_data <- function(file_paths) {
  data_list <- list()

  for (name in names(file_paths)) {
    path <- file_paths[[name]]
    tryCatch({
      # Attempt to read the CSV file into a dataframe
      data_list[[name]] <- read_csv(path)
    }, error = function(e) {
      # Store the error message if file reading fails
      data_list[[name]] <- paste("Error:", e$message)
    })
  }

  return(data_list)
}

# Base path for all files
base_path <- "C:/Users/Nandan Hegde/OneDrive/Documents/MSU_Grad_Studies/Semester2/STT811/STT811_ICA/Museums/"

# File paths defined more efficiently
file_paths <- list(
  artist = paste0(base_path, "artist.csv"),
  canvas_size = paste0(base_path, "canvas_size.csv"),
  museum_hours = paste0(base_path, "museum_hours.csv"),
  museum = paste0(base_path, "museum.csv"),
  product_size = paste0(base_path, "product_size.csv"),
  subject = paste0(base_path, "subject.csv"),
  work = paste0(base_path, "work.csv")
)

# Load the data
museum_data <- load_museum_data(file_paths)

## Rows: 421 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): full_name, first_name, middle_names, last_name, nationality, style
## dbl (3): artist_id, birth, death
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 200 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): label
## dbl (3): size_id, width, height
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 351 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): day, close
## dbl  (1): museum_id
## time (1): open
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 57 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): name, address, city, state, postal, country, phone, url
## dbl (1): museum_id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 110347 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): work_id, size_id, sale_price, regular_price
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 6771 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): subject
## dbl (1): work_id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 14776 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, style
## dbl (3): work_id, artist_id, museum_id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Display column names for each table
for (name in names(museum_data)) {
  if (is.data.frame(museum_data[[name]])) {
    cat(name, "columns:", paste(colnames(museum_data[[name]]), collapse = ", "), "\n")
  } else {
    cat(name, "could not be loaded:", museum_data[[name]], "\n")
  }
}

## artist columns: artist_id, full_name, first_name, middle_names, last_name, nationality, style, birth, death 
## canvas_size columns: size_id, width, height, label 
## museum_hours columns: museum_id, day, open, close 
## museum columns: museum_id, name, address, city, state, postal, country, phone, url 
## product_size columns: work_id, size_id, sale_price, regular_price 
## subject columns: work_id, subject 
## work columns: work_id, name, artist_id, style, museum_id

artist <- museum_data$artist
canvas_size <- museum_data$canvas_size
museum_hours <- museum_data$museum_hours
museum <- museum_data$museum
product_size <- museum_data$product_size
subject <- museum_data$subject
work <- museum_data$work

# Question 1

museums_in_france <- sqldf("SELECT name FROM museum WHERE country = 'France'")
cat("Museums in France:\n")

## Museums in France:

print(museums_in_france)

##                              name
## 1              Museum of Grenoble
## 2 Musée des Beaux-Arts de Quimper
## 3                 Musée du Louvre
## 4                   Musée d'Orsay
## 5    Museum of Fine Arts of Nancy
## 6           Musée Marmottan Monet
## 7                     Army Museum

# Question 2

most_works <- sqldf(
  "SELECT m.name, COUNT(w.work_id) AS work_count
   FROM work w
   JOIN museum m ON w.museum_id = m.museum_id
   GROUP BY m.name
   ORDER BY work_count DESC
   LIMIT 1"
)
cat("Museum with the most works:\n")

## Museum with the most works:

print(most_works)

##                             name work_count
## 1 The Metropolitan Museum of Art        939

test_museum <- sqldf(
"WITH avg_price_museum AS (
SELECT m.museum_id,w.work_id, m.name, AVG(ps.sale_price) AS avg_price
FROM artist a
JOIN work w ON a.artist_id = w.artist_id
JOIN product_size ps ON w.work_id = ps.work_id
JOIN museum m ON w.museum_id = m.museum_id
WHERE a.birth >= '1800' AND a.birth < '1900'
AND ps.sale_price IS NOT NULL
GROUP BY m.museum_id, m.name
ORDER BY avg_price DESC
LIMIT 1)

SELECT * FROM avg_price_museum"
)

 test_artist <- sqldf("SELECT COUNT(DISTINCT a.artist_id) AS distinct_artist_count
 FROM artist a
 JOIN work w ON a.artist_id = w.artist_id
 WHERE a.birth >= '1900' AND a.birth < '2000' AND w.museum_id = (SELECT museum_id FROM test_museum)")

# Question 3

result <- sqldf(
"WITH avg_price_museum AS (
SELECT m.museum_id, m.name, AVG(ps.sale_price) AS avg_price
FROM artist a
JOIN work w ON a.artist_id = w.artist_id
JOIN product_size ps ON w.work_id = ps.work_id
JOIN museum m ON w.museum_id = m.museum_id
WHERE a.birth >= '1800' AND a.birth < '1900'
AND ps.sale_price IS NOT NULL
GROUP BY m.museum_id, m.name
ORDER BY avg_price DESC
LIMIT 1
 ),
distinct_artists AS (
 SELECT COUNT(DISTINCT a.artist_id) AS distinct_artist_count_20th_century
 FROM artist a
 JOIN work w ON a.artist_id = w.artist_id
 WHERE a.birth >= '1900' AND a.birth < '2000' AND w.museum_id = (SELECT museum_id FROM avg_price_museum)
)
SELECT apm.name AS museum_name, apm.avg_price, distinct_artist_count_20th_century
FROM avg_price_museum apm, distinct_artists da"
)

cat("Museum with highest average price and 20th-century artist contributions:\n")

## Museum with highest average price and 20th-century artist contributions:

print(result)

##           museum_name avg_price distinct_artist_count_20th_century
## 1 Museum of Fine Arts  546.6667                                  0