# These are the package and libraries I needed for the motorins data.
options(repos = list(CRAN = "http://cran.us.r-project.org"))
install.packages("GLMsData")
##
## The downloaded binary packages are in
## /var/folders/c6/d2zbrqtx28x8kgkvh9_m0v800000gn/T//Rtmpe4AX3Z/downloaded_packages
install.packages(c("reshape2", "ggplot2"))
##
## The downloaded binary packages are in
## /var/folders/c6/d2zbrqtx28x8kgkvh9_m0v800000gn/T//Rtmpe4AX3Z/downloaded_packages
install.packages("Hmisc")
##
## The downloaded binary packages are in
## /var/folders/c6/d2zbrqtx28x8kgkvh9_m0v800000gn/T//Rtmpe4AX3Z/downloaded_packages
install.packages(c("tidymodels", "GGally"))
##
## The downloaded binary packages are in
## /var/folders/c6/d2zbrqtx28x8kgkvh9_m0v800000gn/T//Rtmpe4AX3Z/downloaded_packages
install.packages("rsample")
##
## The downloaded binary packages are in
## /var/folders/c6/d2zbrqtx28x8kgkvh9_m0v800000gn/T//Rtmpe4AX3Z/downloaded_packages
install.packages("caret")
##
## The downloaded binary packages are in
## /var/folders/c6/d2zbrqtx28x8kgkvh9_m0v800000gn/T//Rtmpe4AX3Z/downloaded_packages
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.8 ✔ recipes 1.3.1
## ✔ dials 1.4.0 ✔ rsample 1.3.0
## ✔ dplyr 1.1.4 ✔ tibble 3.3.0
## ✔ ggplot2 3.5.2 ✔ tidyr 1.3.1
## ✔ infer 1.0.9 ✔ tune 1.3.0
## ✔ modeldata 1.4.0 ✔ workflows 1.2.0
## ✔ parsnip 1.3.2 ✔ workflowsets 1.1.1
## ✔ purrr 1.0.4 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(rsample)
library(dplyr)
library(mlbench)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.4 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ readr::spec() masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(Hmisc)
##
## Attaching package: 'Hmisc'
##
## The following object is masked from 'package:parsnip':
##
## translate
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
library(GLMsData)
library(Hmisc)
library(ggplot2)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
##
## The following object is masked from 'package:purrr':
##
## lift
#Data for auto insurance in 1977
data(motorins)
# There are 2182 observations and 7 variables
dim(motorins)
## [1] 2182 7
head(motorins)
## Kilometres Zone Bonus Make Insured Claims Payment
## 1 1 1 1 1 455.13 108 392491
## 2 1 1 1 2 69.17 19 46221
## 3 1 1 1 3 72.88 13 15694
## 4 1 1 1 4 1292.39 124 422201
## 5 1 1 1 5 191.01 40 119373
## 6 1 1 1 6 477.66 57 170913
glimpse(motorins)
## Rows: 2,182
## Columns: 7
## $ Kilometres <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Zone <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Bonus <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,…
## $ Make <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2,…
## $ Insured <dbl> 455.13, 69.17, 72.88, 1292.39, 191.01, 477.66, 105.58, 32.5…
## $ Claims <int> 108, 19, 13, 124, 40, 57, 23, 14, 1704, 45, 10, 5, 48, 11, …
## $ Payment <int> 392491, 46221, 15694, 422201, 119373, 170913, 56940, 77487,…
describe(motorins)
## motorins
##
## 7 Variables 2182 Observations
## --------------------------------------------------------------------------------
## Kilometres
## n missing distinct Info Mean pMedian Gmd
## 2182 0 5 0.96 2.986 3 1.596
##
## Value 1 2 3 4 5
## Frequency 439 441 441 434 427
## Proportion 0.201 0.202 0.202 0.199 0.196
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## Zone
## n missing distinct Info Mean pMedian Gmd
## 2182 0 7 0.98 3.97 4 2.273
##
## Value 1 2 3 4 5 6 7
## Frequency 315 315 315 315 313 315 294
## Proportion 0.144 0.144 0.144 0.144 0.143 0.144 0.135
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## Bonus
## n missing distinct Info Mean pMedian Gmd
## 2182 0 7 0.98 4.015 4 2.287
##
## Value 1 2 3 4 5 6 7
## Frequency 307 312 310 310 313 315 315
## Proportion 0.141 0.143 0.142 0.142 0.143 0.144 0.144
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## Make
## n missing distinct Info Mean pMedian Gmd
## 2182 0 9 0.988 4.992 5 2.969
##
## Value 1 2 3 4 5 6 7 8 9
## Frequency 245 245 242 238 244 244 242 237 245
## Proportion 0.112 0.112 0.111 0.109 0.112 0.112 0.111 0.109 0.112
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## Insured
## n missing distinct Info Mean pMedian Gmd .05
## 2182 0 2070 1 1092 173 1932 2.311
## .10 .25 .50 .75 .90 .95
## 5.701 21.610 81.525 389.783 1688.186 4711.983
##
## lowest : 0.01 0.03 0.04 0.08 0.09
## highest: 60140.2 65617.5 79615 121293 127687
## --------------------------------------------------------------------------------
## Claims
## n missing distinct Info Mean pMedian Gmd .05
## 2182 0 256 0.993 51.87 10 89.6 0.0
## .10 .25 .50 .75 .90 .95
## 0.0 1.0 5.0 21.0 87.9 233.9
##
## lowest : 0 1 2 3 4, highest: 2087 2127 2548 2894 3338
## --------------------------------------------------------------------------------
## Payment
## n missing distinct Info Mean pMedian Gmd .05
## 2182 0 1752 0.995 257008 51132 445992 0
## .10 .25 .50 .75 .90 .95
## 0 2989 27404 111954 439778 1102675
##
## lowest : 0 72 148 303 309
## highest: 10315455 10518198 13203616 15540162 18245026
## --------------------------------------------------------------------------------
# View the description, structure and summary
?motorins
# Description Provided by R
# The data give details of third-party motor insurance claims in Sweden for the year 1977.
# Format
# A data frame with 2182 observations on the following 7 variables
#
# Kilometres
# the number of kilometres travelled per year; a numeric vector with levels 1 (less than 10000), 2 (from 10000 to 15000), 3 (15000 to 20000), 4 (20000 to 25000) or 5 (more than 25000)
#
# Zone
# geographical zone (only in motorins); a numeric vector with levels 1 to 7 ()see Details below)
#
# Bonus
# no claim bonus; a numeric vector equal to the number of years plus one since the last claim
#
# Make
# the make of vehicle; a numeric vector with levels from 1 to 8 representing eight common cart models, and 9 representing all other models
#
# Insured
# the number of insured in policy-years; a numeric vector
#
# Claims
# the number of claims; a numeric vector
#
# Payment
# the total value of payments in Skoner; a numeric vector
#
# Details
# For variable Zone, the geographical zones are:
#
# 1 Stockholm, Goteborg, Malmo with surroundings
# 2 Other large cities and surroundings
# 3 Small cities in northern Sweden
# 4 Small cities in southern Sweden
# 5 Rural areas in northern Sweden
# 6 Rural areas in southern Sweden
# 7 Gotland
#
# “In Sweden all motor insurance companies apply identical risk arguments to classify customers, and thus their portfolios and their claims statistics can be combined. The data were compiled by a Swedish Committee on the Analysis of Risk Premium in Motor Insurance. The Committee was asked to look into the problem of analyzing the real influence on claims of the risk arguments and to compare this structure with the actual tariff” (Andrews and Herzberg (1985), p. 413).
#
# Make 4 is the Volkswagen 1200, which was discontinued shortly after 1977. The other makes could not be identified because of the potential for the data to impact on sales of those cars.
str(motorins)
## 'data.frame': 2182 obs. of 7 variables:
## $ Kilometres: int 1 1 1 1 1 1 1 1 1 1 ...
## $ Zone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Bonus : int 1 1 1 1 1 1 1 1 1 2 ...
## $ Make : int 1 2 3 4 5 6 7 8 9 1 ...
## $ Insured : num 455.1 69.2 72.9 1292.4 191 ...
## $ Claims : int 108 19 13 124 40 57 23 14 1704 45 ...
## $ Payment : int 392491 46221 15694 422201 119373 170913 56940 77487 6805992 214011 ...
summary(motorins)
## Kilometres Zone Bonus Make
## Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:2.000 1st Qu.:3.000
## Median :3.000 Median :4.00 Median :4.000 Median :5.000
## Mean :2.986 Mean :3.97 Mean :4.015 Mean :4.992
## 3rd Qu.:4.000 3rd Qu.:6.00 3rd Qu.:6.000 3rd Qu.:7.000
## Max. :5.000 Max. :7.00 Max. :7.000 Max. :9.000
## Insured Claims Payment
## Min. : 0.01 Min. : 0.00 Min. : 0
## 1st Qu.: 21.61 1st Qu.: 1.00 1st Qu.: 2989
## Median : 81.53 Median : 5.00 Median : 27404
## Mean : 1092.19 Mean : 51.87 Mean : 257008
## 3rd Qu.: 389.78 3rd Qu.: 21.00 3rd Qu.: 111954
## Max. :127687.27 Max. :3338.00 Max. :18245026
# This will show the factor variables (Kilometres, Zone, Make)
sapply(motorins, is.factor)
## Kilometres Zone Bonus Make Insured Claims Payment
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Display in a table with the proper labels
Zone_labels <- c(
"1" = "Stockholm, Göteborg, Malmö & surroundings",
"2" = "Other large cities & surroundings",
"3" = "Small cities (northern Sweden)",
"4" = "Small cities (southern Sweden)",
"5" = "Rural areas (northern Sweden)",
"6" = "Rural areas (southern Sweden)",
"7" = "Gotland"
)
# Convert numeric Zone to labeled factor
motorins$ZoneLabel <- factor(as.character(motorins$Zone),
levels = names(Zone_labels),
labels = Zone_labels)
table(motorins$ZoneLabel)
##
## Stockholm, Göteborg, Malmö & surroundings
## 315
## Other large cities & surroundings
## 315
## Small cities (northern Sweden)
## 315
## Small cities (southern Sweden)
## 315
## Rural areas (northern Sweden)
## 313
## Rural areas (southern Sweden)
## 315
## Gotland
## 294
# This will plot claims vs insured after making numeric
motor_raw <- motorins %>%
mutate(
Kilometres = as.numeric(as.character(Kilometres)),
Zone = as.numeric(as.character(Zone)),
Make = as.numeric(as.character(Make))
)
ggplot(motor_raw, aes(x = Insured, y = Claims)) +
geom_point(alpha = 0.5, color = "blue") +
geom_smooth(method = "lm", color = "darkblue", se = FALSE) +
labs(
title = "Claims vs Insured (Number of Policies)",
x = "Insured (policy-years)",
y = "Number of Claims"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#This will plot payment vs insured
ggplot(motor_raw, aes(x = Insured, y = Payment)) +
geom_point(alpha = 0.5, color = "darkgreen") +
geom_smooth(method = "lm", color = "forestgreen", se = FALSE) +
labs(
title = "Payment vs Insured (Total Claim Costs)",
x = "Insured (policy-years)",
y = "Payment (SEK)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#Summary information for the payment variable
x <- motorins$Payment
mean(x, na.rm = TRUE)
## [1] 257007.6
median(x, na.rm = TRUE)
## [1] 27403.5
min(x, na.rm = TRUE)
## [1] 0
max(x, na.rm = TRUE)
## [1] 18245026
quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
## 25% 75%
## 2988.75 111953.75
summary(x) # gives Min, 1st Qu., Median, Mean, 3rd Qu., Max
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 2989 27404 257008 111954 18245026
# This chunk is to find the correlation matrix to answer the first assignment question.
# Find numeric columns in motorins
numeric_vars <- sapply(motorins, is.numeric)
# Compute correlation matrix
cm <- cor(motorins[, numeric_vars], use = "pairwise.complete.obs")
# Create a matrix of the output.
cm <- as.matrix(cm)
# Remove diagonal and duplicates. Set diagonal to 0 to ignore self-correlation, which will always return 1.0
diag(cm) <- 0
cm[upper.tri(cm)] <- NA
# I am isolating the variables with correlation absolute value greater than 0.5.
filtered <- which(abs(cm) > 0.5, arr.ind = TRUE)
# 4. List variable pairs and their correlations
result <- data.frame(
var1 = rownames(cm)[filtered[, 1]],
var2 = colnames(cm)[filtered[, 2]],
cor = cm[filtered]
)
result
## var1 var2 cor
## 1 Claims Insured 0.9103478
## 2 Payment Insured 0.9332170
## 3 Payment Claims 0.9954003
# I don't want duplicates, only the unique pairs.
library(reshape2)
cm[upper.tri(cm, diag = TRUE)] <- NA
# This converts to long format, then filters by threshold
df <- melt(cm, na.rm = TRUE)
colnames(df) <- c("var1", "var2", "correlation")
strong_pairs <- subset(df, abs(correlation) > 0.5)
strong_pairs
## var1 var2 correlation
## 34 Claims Insured 0.9103478
## 35 Payment Insured 0.9332170
## 42 Payment Claims 0.9954003
# This starts the second part of the assignment: preprocessing and visualization.
# install.packages(c("tidymodels", "GGally"))
# install.packages("rsample")
# library(tidymodels)
# library(GGally)
# library(rsample)
# I'm starting the process by separating the data into two parts, train and test. It specifies 75% used to train and 25% used to test. We didn't do this in class but the code is straightforward enough for me to understand.
set.seed(2025)
data_split <- initial_split(motorins, prop = 0.75)
# Name the new data sets.
train_data <- training(data_split)
test_data <- testing(data_split)
# Check for new dimensions.
dim(train_data)
## [1] 1636 8
dim(test_data)
## [1] 546 8
# This "recipe" is predicting payment using all the other data
# The 2nd line is marking the variables as categorical
# The 3rd is making the categories binary (yes or no, 0 or 1)
# The 4th removes columns that don't vary (zero variance)
# The 5th line removes values that are highly correlated
# The last line normalizes by centering and scaling
motor_rec <- recipe(Payment ~ ., data = train_data) %>%
update_role(Zone, Make, Kilometres, new_role = "nominal") %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_corr(all_numeric_predictors(), threshold = 0.8) %>%
step_normalize(all_numeric_predictors())
motor_rec
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 4
## nominal: 3
##
## ── Operations
## • Dummy variables from: all_nominal_predictors()
## • Zero variance filter on: all_predictors()
## • Correlation filter on: all_numeric_predictors()
## • Centering and scaling for: all_numeric_predictors()
library(GGally)
# The "prep" function prepares the new data for the recipe to be applied to in the "bake" function. It makes the actual updates to the data whereas the recipe just provides the plan.
motor_rec_prep <- prep(motor_rec, training = train_data)
# The "bake" function now applies the recipe transforms to both training and test sets. It executes the plan.
train_clean <- bake(motor_rec_prep, new_data = train_data)
test_clean <- bake(motor_rec_prep, new_data = test_data)
# View the outcome.
dim(train_clean)
## [1] 1636 12
dim(test_clean)
## [1] 546 12
glimpse(train_clean)
## Rows: 1,636
## Columns: 12
## $ Kilometres <int> 3, 5, 1, 4, 1, 3, 2, 3, 5,…
## $ Zone <int> 1, 4, 5, 4, 3, 7, 1, 2, 3,…
## $ Bonus <dbl> 0.98914282, 0.48767469, -0…
## $ Make <int> 7, 5, 9, 6, 7, 5, 4, 2, 7,…
## $ Claims <dbl> -0.18113922, -0.26009655, …
## $ Payment <int> 90830, 4525, 481752, 13911…
## $ ZoneLabel_Other.large.cities...surroundings <dbl> -0.4114647, -0.4114647, -0…
## $ ZoneLabel_Small.cities..northern.Sweden. <dbl> -0.4012589, -0.4012589, -0…
## $ ZoneLabel_Small.cities..southern.Sweden. <dbl> -0.4155169, 2.4051702, -0.…
## $ ZoneLabel_Rural.areas..northern.Sweden. <dbl> -0.3961129, -0.3961129, 2.…
## $ ZoneLabel_Rural.areas..southern.Sweden. <dbl> -0.4365405, -0.4365405, -0…
## $ ZoneLabel_Gotland <dbl> -0.3961129, -0.3961129, -0…
# Plot the pairwise relationships among numeric predictors
GGally::ggpairs(
select(train_clean, where(is.numeric)),
title = "Pairwise Relationships of Numeric Features (Post-Processing)"
)

ggplot(train_clean, aes(
x = factor(Kilometres),
y = Payment,
fill = factor(Kilometres)
)) +
geom_boxplot(alpha = 0.7) +
labs(
x = "Kilometres Category",
y = "Scaled Payment",
title = "Post-Processed Payment by Usage"
) +
theme_minimal() +
theme(legend.position = "none")

# Now start for the second dataset
# This dataset we've used and preprocessed before in a previous homework.
data(iris)
dim(iris)
## [1] 150 5
# Summary statistics
summary(iris[, 1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Compute means, medians, SDs
sapply(iris[, 1:4], mean)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 5.843333 3.057333 3.758000 1.199333
sapply(iris[, 1:4], median)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 5.80 3.00 4.35 1.30
sapply(iris[, 1:4], sd)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0.8280661 0.4358663 1.7652982 0.7622377
# a) Scatterplot matrix with species coloring
GGally::ggpairs(iris, aes(color = Species, alpha = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# b) Scatterplot: Petal.Length vs Petal.Width
ggplot(iris, aes(x = Petal.Width, y = Petal.Length, color = Species)) +
geom_point(size = 2) +
geom_smooth(method = "lm", se = FALSE) +
labs(
title = "Petal Length vs Width by Species",
x = "Petal Width",
y = "Petal Length"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Preprocess (center, scale, PCA)
preProc <- preProcess(iris[, 1:4], method = c("center", "scale", "pca"), thresh = 0.95)
print(preProc) # shows mean, SD, PCs used
## Created from 150 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
## - principal component signal extraction (4)
## - scaled (4)
##
## PCA needed 2 components to capture 95 percent of the variance
iris_pca <- predict(preProc, iris[, 1:4])
summary(iris_pca)
## PC1 PC2
## Min. :-2.7651 Min. :-2.67732
## 1st Qu.:-2.0957 1st Qu.:-0.59205
## Median : 0.4169 Median :-0.01744
## Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 1.3385 3rd Qu.: 0.59649
## Max. : 3.2996 Max. : 2.64521
# Verify scaled mean ≈ 0 and sd ≈ 1
apply(iris_pca, 2, mean)
## PC1 PC2
## -9.399888e-17 -7.736867e-17
apply(iris_pca, 2, sd)
## PC1 PC2
## 1.7083611 0.9560494
# PCA plot with ggplot2 (first 2 components)
iris_pca_df <- data.frame(iris_pca[, 1:2], Species = iris$Species)
# Calculate variance percentages
preProc <- preProcess(iris[, 1:4], method = c("center", "scale", "pca"), thresh = 0.95)
# This line finds the standard deviations of PC’s
stds <- preProc$std
# Compute variance-percentage for the first two components
vars <- round(100 * stds[1:2]^2 / sum(stds^2), 1)
print(vars)
## Sepal.Length Sepal.Width
## 15.0 4.2
ggplot(iris_pca_df, aes(x = PC1, y = PC2, color = Species, shape = Species)) +
geom_point(size = 2.5) +
labs(
title = "PCA on Iris (Scaled, Centered)",
x = paste0("PC1 (", vars[1], "%)"),
y = paste0("PC2 (", vars[2], "%)")
) +
theme_minimal()
