Your name: Kevin McDonald
Fix the code below
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Question 1: mpg_data creation
# Toy data set of 20 rows alternating between the two manufacturers.
# Later columns reuse the earlier `id` and `manufacturer` columns so the
# literal vectors are written only once.
q1_mpg_data <- tibble(
  id = seq_len(20),
  odd = id %% 2 == 1,
  manufacturer = rep(c("audi", "ford"), times = 10),
  year = rep(1999:2008, each = 2),
  # Slope is 20 for ford rows and 30 otherwise, scaled by the row id.
  Efficiency = ifelse(manufacturer == "ford", 20, 30) * id + 5
)
# Question 2: model info
# Lookup table pairing each manufacturer with a model, written row-wise.
# The last row deliberately has a missing manufacturer key.
q2_model_info <- tribble(
  ~manufacturer, ~model,
  "audi",        "a4",
  "ford",        "f-150",
  "honda",       "civic",
  NA_character_, "accord"
)
# Merge and analyze
# Average efficiency per year/model for the odd-numbered rows.
#
# Steps: keep rows flagged `odd`, attach the model name by manufacturer,
# build a "<year>_<model>" label, then average Efficiency within each label.
q1_merged <- q1_mpg_data %>%
  filter(odd) %>%
  inner_join(
    q2_model_info,
    by = join_by(manufacturer),
    # q2_model_info contains an NA manufacturer; by default dplyr joins
    # treat NA keys as equal, which would silently label any NA
    # manufacturer on the left as "accord". Never match on NA.
    na_matches = "never",
    # Each manufacturer must map to at most one model; error instead of
    # duplicating rows if the lookup table ever gains a repeated key.
    relationship = "many-to-one"
  ) %>%
  mutate(year_and_model = paste(year, model, sep = "_")) %>%
  group_by(year_and_model) %>%
  # Drop the grouping explicitly so the result is a plain tibble.
  summarise(efficiency_avg = mean(Efficiency), .groups = "drop")
print(q1_merged)
## # A tibble: 10 × 2
## year_and_model efficiency_avg
## <chr> <dbl>
## 1 1999_a4 35
## 2 2000_a4 95
## 3 2001_a4 155
## 4 2002_a4 215
## 5 2003_a4 275
## 6 2004_a4 335
## 7 2005_a4 395
## 8 2006_a4 455
## 9 2007_a4 515
## 10 2008_a4 575
# Question 2 (35 points)
# Complete the following tasks using appropriate R functions and the mpg dataset from ggplot2.
# Filter for vehicles with highway mileage (hwy) above 30.
# Arrange the result by descending highway mileage.
# Create a new field called difference that shows the difference between highway (hwy) and city mileage (cty).
# Create a new field called title that shows both the manufacturer and model (e.g., toyota rav4).
# Select only the title, cty, difference, and hwy columns.
# Use `esquisse` (`esq`) or `plot` to show the hwy versus cty mileage in a scatterplot.
# Load the mpg dataset
library(tidyverse)
# Bring ggplot2's mpg data into the workspace and keep an untouched copy.
data(mpg)
t_raw_mpg <- as_tibble(mpg)

# Vehicles with highway mileage above 30, best highway mileage first,
# reduced to a display title, the two mileage figures, and their gap.
t_clean <- t_raw_mpg %>%
  filter(hwy > 30) %>%
  arrange(desc(hwy)) %>%
  mutate(
    title = paste(manufacturer, model),
    difference = hwy - cty
  ) %>%
  select(title, cty, difference, hwy)
# Scatterplot of city against highway mileage for the filtered vehicles,
# with one colour per manufacturer/model title.
ggplot(t_clean, aes(x = hwy, y = cty, colour = title)) +
  geom_point() +
  scale_color_hue(direction = 1) +
  theme_minimal()
Create a new tibble using the mpg dataset (do not re-use your one from above).
Create a model predicting if each car is a compact or subcompact (i.e., its class). Calculate accuracy, precision, and recall. Explain which metric would be the most important for an insurance company trying to identify fraudulent claims for further investigation (assume they prioritize not missing any potential frauds).
# Restrict to the two classes being distinguished and encode the target
# as a numeric indicator: 1 for compact, 0 for subcompact.
t_mpg <- t_raw_mpg %>%
  filter(class %in% c("compact", "subcompact")) %>%
  mutate(is_compact = as.numeric(class == "compact"))
# Class balance of the modelling data.
table(t_mpg$class)
##
## compact subcompact
## 47 35
# Fit a least-squares linear model of the 0/1 `is_compact` indicator on
# manufacturer. With a single categorical predictor, each fitted value is
# the share of compact cars for that manufacturer.
# NOTE(review): for a binary outcome, glm(..., family = binomial) is the more
# usual choice; lm is used here as a quick exploratory fit - confirm intent.
m <- lm(is_compact ~ manufacturer, data = t_mpg)
# Print the coefficient table and fit statistics.
summary(m)
##
## Call:
## lm(formula = is_compact ~ manufacturer, data = t_mpg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7 0.0 0.0 0.0 0.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.000e+00 7.474e-02 13.380 < 2e-16 ***
## manufacturerford -1.000e+00 1.220e-01 -8.194 5.57e-12 ***
## manufacturerhonda -1.000e+00 1.220e-01 -8.194 5.57e-12 ***
## manufacturerhyundai -1.000e+00 1.325e-01 -7.548 9.25e-11 ***
## manufacturernissan -8.891e-16 2.179e-01 0.000 1.000000
## manufacturersubaru -5.000e-01 1.267e-01 -3.946 0.000179 ***
## manufacturertoyota -9.656e-16 1.121e-01 0.000 1.000000
## manufacturervolkswagen -3.000e-01 9.887e-02 -3.034 0.003325 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2895 on 74 degrees of freedom
## Multiple R-squared: 0.6909, Adjusted R-squared: 0.6617
## F-statistic: 23.63 on 7 and 74 DF, p-value: < 2.2e-16
# Exploratory look at the modelling data: distribution of highway mileage,
# distribution of city mileage, and the number of rows per manufacturer.
hist(t_mpg$hwy)
hist(t_mpg$cty)
table(t_mpg$manufacturer)
##
## audi ford honda hyundai nissan subaru toyota
## 15 9 9 7 2 8 12
## volkswagen
## 20
# Rule-based classifier: predict "compact" (1) for the listed manufacturers
# and "subcompact" (0) for every other manufacturer.
# NOTE(review): the linear fit reports nissan with the same fitted value as
# the audi baseline (coefficient ~0 on an intercept of 1), yet nissan is not
# in this list - confirm whether that omission is intended.
model <- t_mpg %>%
  mutate(
    prediction = as.numeric(
      manufacturer %in% c("audi", "volkswagen", "toyota")
    )
  )
# Flag each row with its confusion-matrix cell (1 = row belongs to the cell):
#   TP: actual compact,    predicted compact
#   FP: actual subcompact, predicted compact
#   FN: actual compact,    predicted subcompact
#   TN: actual subcompact, predicted subcompact
comparison_df <- model %>%
  mutate(
    TP = as.numeric(is_compact == 1 & prediction == 1),
    FP = as.numeric(is_compact == 0 & prediction == 1),
    FN = as.numeric(is_compact == 1 & prediction == 0),
    TN = as.numeric(is_compact == 0 & prediction == 0)
  )
# Total each confusion-matrix cell across all rows.
TP <- sum(comparison_df[["TP"]])
FP <- sum(comparison_df[["FP"]])
FN <- sum(comparison_df[["FN"]])
TN <- sum(comparison_df[["TN"]])

# accuracy:  share of all rows classified correctly
# precision: share of predicted compacts that really are compact
# recall:    share of actual compacts that were predicted compact
accuracy <- (TP + TN) / sum(TP, FP, FN, TN)
precision <- TP / sum(TP, FP)
recall <- TP / sum(TP, FN)

# For the fraud-screening scenario in the prompt (the insurer must not miss
# potential frauds), recall is the metric to prioritise: every missed case is
# a false negative, and false negatives are exactly what lowers recall.
print(paste0("Accuracy is ", accuracy))
## [1] "Accuracy is 0.853658536585366"
print(paste0("Precision is ", precision))
## [1] "Precision is 0.872340425531915"
print(paste0("Recall is ", recall))
## [1] "Recall is 0.872340425531915"