Question 1 (35%)

Fix the code below

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Question 1, step 1: synthetic vehicle table with 20 rows (ids 1 to 20).
q1_mpg_data <- tibble(
  id = seq_len(20),
  # TRUE for odd ids; this flag drives the filter in the merge step.
  odd = id %% 2 != 0,
  manufacturer = rep(c("audi", "ford"), times = 10),
  year = rep(1999:2008, each = 2),
  # Row multiplier is 20 for ford and 30 for audi, scaled by id, plus 5.
  Efficiency = if_else(manufacturer == "ford", 20, 30) * id + 5
)

# Question 1, step 2: manufacturer-to-model lookup. The last row has no
# manufacturer, and honda has no counterpart in q1_mpg_data.
q2_model_info <- tribble(
  ~manufacturer, ~model,
  "audi",        "a4",
  "ford",        "f-150",
  "honda",       "civic",
  NA_character_, "accord"
)

# Question 1, step 3: keep the odd-id rows, attach the model name, and
# average Efficiency for each "<year>_<model>" label.
q1_merged <- q1_mpg_data %>%
  filter(odd) %>%
  inner_join(q2_model_info, by = join_by(manufacturer)) %>%
  mutate(year_and_model = paste0(year, "_", model)) %>%
  group_by(year_and_model) %>%
  summarise(efficiency_avg = mean(Efficiency))

print(q1_merged)

## # A tibble: 10 × 2
##    year_and_model efficiency_avg
##    <chr>                   <dbl>
##  1 1999_a4                    35
##  2 2000_a4                    95
##  3 2001_a4                   155
##  4 2002_a4                   215
##  5 2003_a4                   275
##  6 2004_a4                   335
##  7 2005_a4                   395
##  8 2006_a4                   455
##  9 2007_a4                   515
## 10 2008_a4                   575

# Question 2 (35 points)

# Complete the following tasks using appropriate R functions and the mpg dataset from ggplot2.
#   - Filter for vehicles with highway mileage (hwy) above 30.
#   - Arrange the result by descending highway mileage.
#   - Create a new field called difference that shows the difference between highway (hwy) and city mileage (cty).
#   - Create a new field called title that shows both the manufacturer and model (e.g., toyota rav4).
#   - Select only the title, cty, difference, and hwy columns.
#   - Use `esq` or `plot` to show the hwy versus cty mileage in a scatterplot.
  

# Load the mpg dataset from ggplot2 and keep an unmodified tibble copy of it.
library(tidyverse)
data(mpg)
# as_tibble() is the explicit coercion; tibble(mpg) only works because an
# unnamed data-frame argument is spliced into columns.
t_raw_mpg <- as_tibble(mpg)

# Build the Question 2 table from the raw tibble:
#   1. keep vehicles whose highway mileage exceeds 30,
#   2. sort by highway mileage, highest first,
#   3. add `difference` (hwy minus cty) and `title` ("manufacturer model"),
#   4. keep only the four requested columns.
t_clean <- t_raw_mpg %>%
  filter(hwy > 30) %>%
  arrange(desc(hwy)) %>%
  mutate(
    difference = hwy - cty,
    title = paste(manufacturer, model)
  ) %>%
  select(title, cty, difference, hwy)

# Scatterplot of city against highway mileage, one colour per vehicle title.
ggplot(t_clean) +
  aes(x = hwy, y = cty, colour = title) +
  geom_point() +
  scale_color_hue(direction = 1) +
  theme_minimal()

Q2a (30 points)

Create a new tibble using the mpg dataset (do not re-use your one from above).

Create a model predicting whether each car is a compact or a subcompact (i.e., its class). Calculate accuracy, precision, and recall. Explain which metric would be the most important for an insurance company trying to identify fraudulent claims for further investigation (assume they prioritize not missing any potential frauds).

# Restrict the raw data to compact and subcompact vehicles and add a numeric
# target column: is_compact is 1 for class "compact" and 0 for "subcompact".
t_mpg <- t_raw_mpg %>%
  filter(class %in% c("compact", "subcompact")) %>%
  mutate(is_compact = if_else(class == "compact", 1, 0))

# Row counts per class in the filtered data.
table(t_mpg$class)

## 
##    compact subcompact 
##         47         35

# Ordinary least-squares fit of the 0/1 is_compact flag on manufacturer.
# With a single categorical predictor, each coefficient is that
# manufacturer's offset from the baseline level's share of compacts.
m <- lm(formula = is_compact ~ manufacturer, data = t_mpg)

# Coefficient table and fit statistics for the model.
summary(m)

## 
## Call:
## lm(formula = is_compact ~ manufacturer, data = t_mpg)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##   -0.7    0.0    0.0    0.0    0.5 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             1.000e+00  7.474e-02  13.380  < 2e-16 ***
## manufacturerford       -1.000e+00  1.220e-01  -8.194 5.57e-12 ***
## manufacturerhonda      -1.000e+00  1.220e-01  -8.194 5.57e-12 ***
## manufacturerhyundai    -1.000e+00  1.325e-01  -7.548 9.25e-11 ***
## manufacturernissan     -8.891e-16  2.179e-01   0.000 1.000000    
## manufacturersubaru     -5.000e-01  1.267e-01  -3.946 0.000179 ***
## manufacturertoyota     -9.656e-16  1.121e-01   0.000 1.000000    
## manufacturervolkswagen -3.000e-01  9.887e-02  -3.034 0.003325 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2895 on 74 degrees of freedom
## Multiple R-squared:  0.6909, Adjusted R-squared:  0.6617 
## F-statistic: 23.63 on 7 and 74 DF,  p-value: < 2.2e-16

# Distribution of highway mileage (hwy) in the compact/subcompact subset.
hist(t_mpg$hwy)

# Distribution of city mileage (cty) in the same subset.
hist(t_mpg$cty)

# Number of vehicles per manufacturer in the subset.
table(t_mpg$manufacturer)

## 
##       audi       ford      honda    hyundai     nissan     subaru     toyota 
##         15          9          9          7          2          8         12 
## volkswagen 
##         20

# Classification rule: predict "compact" (1) for every vehicle whose
# manufacturer has more than half of its rows in t_mpg labelled compact,
# and "subcompact" (0) otherwise. An exact 50/50 split is predicted as 0.
# The per-manufacturer share of compacts is what a manufacturer-only linear
# model estimates, so the rule is derived from the data instead of from a
# hand-typed manufacturer list that can drift out of sync with it.
model <- t_mpg %>%
  group_by(manufacturer) %>%
  mutate(prediction = as.numeric(mean(is_compact) > 0.5)) %>%
  ungroup()

# Flag each row with its confusion-matrix cell ("compact" is the positive
# class); exactly one of the four indicator columns is 1 per row.
comparison_df <- model %>%
  mutate(
    TP = as.numeric(is_compact == 1 & prediction == 1),
    FP = as.numeric(is_compact == 0 & prediction == 1),
    FN = as.numeric(is_compact == 1 & prediction == 0),
    TN = as.numeric(is_compact == 0 & prediction == 0)
  )

# Cell totals of the confusion matrix.
TP <- sum(comparison_df$TP)
FP <- sum(comparison_df$FP)
FN <- sum(comparison_df$FN)
TN <- sum(comparison_df$TN)

# accuracy:  share of all rows classified correctly.
# precision: share of predicted compacts that really are compact.
# recall:    share of actual compacts that were predicted compact.
# When a missed positive is the costly error, as in the fraud-screening
# scenario described in the task, recall is the metric to prioritise.
accuracy  <- (TP + TN) / (TP + FP + FN + TN)
precision <- TP / (TP + FP)
recall    <- TP / (TP + FN)

print(paste0('Accuracy is ', accuracy))

## [1] "Accuracy is 0.878048780487805"

print(paste0('Precision is ', precision))

## [1] "Precision is 0.877551020408163"

print(paste0('Recall is ', recall))

## [1] "Recall is 0.914893617021277"

R Notebook

Spring 2025, Exam 1

Question 1 (35%)

Q2a (30 points)