#Loading Packages
library(datasets)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(rpart)
library(class)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
Chapters 22 to 30 23.2.1 Exercises One downside of the linear model is that it is sensitive to unusual values because the distance incorporates a squared term. Fit a linear model to the simulated data below, and visualise the results. Rerun a few times to generate different simulated datasets. What do you notice about the model?
Running the simulated linear model generated below a few times confirms that the relationship between x and y is generally linear, although there is a considerable amount of scatter around scatter line, with each variation differing with each simulation. The variation is exemplified more where the x values are larger, which suggests that the linear model may not fit well with larger values of x.
sim1a <- tibble(
x = rep(1:10, each = 3),
y = x * 1.5 + 6 + rt(length(x), df = 2)
)
SimulatedModel<- lm(y ~ x, data = sim1a)
#Plotting the model:
SimulatedModel %>% ggplot(aes(x,y)) +
geom_point() +
geom_smooth(method = "lm", se=F)
## `geom_smooth()` using formula = 'y ~ x'
labs(title = "Linear Model Fit to Simulated Data",
x = "X as the independent variable",
y = "y as the dependent variable") +
theme_bw()
## NULL
# Define function to calculate predicted values using linear model
measure_distance <- function(mod, data) {
mod_coeficient <- c(mod[1], mod[2])
predict(lm(y ~ x, data = data), newdata = data, coef = mod_coeficient)
}
# Define function to calculate mean-absolute distance
measure_distance_abs <- function(mod, data) {
diff <- data$y - measure_distance(mod, data)
mean(abs(diff))
}
# Fit linear model using mean-absolute distance
fitted_lm_absolute <- optim(par = c(0, 0), fn = measure_distance_abs, data = sim1a)
# Compare mean-absolute distance model to linear model
summary(fitted_lm_absolute)
## Length Class Mode
## par 2 -none- numeric
## value 1 -none- numeric
## counts 2 -none- numeric
## convergence 1 -none- numeric
## message 0 -none- NULL
summary(lm(y ~ x, data = sim1a))
##
## Call:
## lm(formula = y ~ x, data = sim1a)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5383 -0.4401 0.0989 0.8711 3.7564
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.8426 0.6836 10.01 9.42e-11 ***
## x 1.3645 0.1102 12.38 7.04e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.733 on 28 degrees of freedom
## Multiple R-squared: 0.8456, Adjusted R-squared: 0.8401
## F-statistic: 153.4 on 1 and 28 DF, p-value: 7.041e-13
# Fit linear model using mean-absolute distance
fitted_lm_absolute <- optim(par = c(0, 0), fn = measure_distance_abs, data = sim1a)
# Compare mean-absolute distance model to linear model
summary(fitted_lm_absolute)
## Length Class Mode
## par 2 -none- numeric
## value 1 -none- numeric
## counts 2 -none- numeric
## convergence 1 -none- numeric
## message 0 -none- NULL
summary(lm(y ~ x, data = sim1a))
##
## Call:
## lm(formula = y ~ x, data = sim1a)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5383 -0.4401 0.0989 0.8711 3.7564
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.8426 0.6836 10.01 9.42e-11 ***
## x 1.3645 0.1102 12.38 7.04e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.733 on 28 degrees of freedom
## Multiple R-squared: 0.8456, Adjusted R-squared: 0.8401
## F-statistic: 153.4 on 1 and 28 DF, p-value: 7.041e-13
Optimizing a three-parameter model can be problematic due to the existence of multiple local optima, which cause the optimization algorithm to produce different parameter estimates depending on the starting values. This can complicate the selection of the best-fitting model and potentially lead to over-fitting. Therefore, it is crucial to try various starting parameter values and assess the goodness of fit of different models to ensure optimal model selection.
model1 <- function(a, data) {
a[1] + data$x * a[2] + a[3]
}
23.4.5 Exercises 3. Using the basic principles, convert the formulas in the following two models into functions. (Hint: start by converting the categorical variable into 0-1 variables.)
library(modelr)
mod1 <- lm(y ~ x1 + x2, data = sim3)
model1 <- function(data, x1, x2) {
# Convert categorical variable into 0-1 variables
x1_0 <- ifelse(data$x1 == 0, 1, 0)
x1_1 <- ifelse(data$x1 == 1, 1, 0)
x2_0 <- ifelse(data$x2 == 0, 1, 0)
x2_1 <- ifelse(data$x2 == 1, 1, 0)
# Compute model output
output <- x1_0 * x2_0 * beta0 + x1_1 * x2_0 * beta1 + x1_0 * x2_1 * beta2 + x1_1 * x2_1 * beta3
return(output)
}
#Note: the coefficients beta0, beta1, beta2, and beta3 are obtained from the lm() function.
mod2 <- lm(y ~ x1 * x2, data = sim3)
model2 <- function(data, x1, x2) {
# Convert categorical variable into 0-1 variables
x1_0 <- ifelse(data$x1 == 0, 1, 0)
x1_1 <- ifelse(data$x1 == 1, 1, 0)
x2_0 <- ifelse(data$x2 == 0, 1, 0)
x2_1 <- ifelse(data$x2 == 1, 1, 0)
# Compute model output
output <- beta0 + beta1 * x1_1 + beta2 * x2_1 + beta3 * x1_1 * x2_1 + beta4 * x1_1 * x2_0 + beta5 * x1_0 * x2_1
return(output)
}
#Note: the coefficients beta0, beta1, beta2, beta3, beta4, and beta5 are obtained from the lm() function.
#Reference from both R4DS and CHAT GPT
27.4.7 Exercises 1. Add a section that explores how diamond sizes vary by cut, colour, and clarity. Assume you’re writing a report for someone who doesn’t know R, and instead of setting echo = FALSE on each chunk, set a global option.
knitr::opts_chunk$set(echo = TRUE)
options("knitr.duplicate.label.warning" = FALSE)
options("knitr.chunk.verbose" = FALSE)
library(ggplot2)
data(diamonds)
ggplot(diamonds, aes(x = carat, y = price, color = cut)) +
geom_point() +
scale_color_manual(values = c("#E69F00", "#56B4E9", "#F0E442", "#009E73", "#D55E00")) +
ggtitle("Diamond Carat Weight vs. Price by Cut") +
xlab("Carat Weight") +
ylab("Price")
#Reference from both R4DS and CHAT GPT
ggplot(diamonds, aes(x = carat, y = price, color = clarity)) +
geom_point() +
ggtitle("Diamond Carat Weight vs. Price by Clarity") +
xlab("Carat Weight") +
ylab("Price")
#Reference from both R4DS and CHAT GPT
Decision Tree and kNN classification
# Load the Iris dataset and split into training and testing sets
data(iris)
set.seed(500) # for reproducibility
trainIndex <- createDataPartition(iris$Species, p=0.75, list=FALSE)
training_set <- iris[trainIndex,]
test_set <- iris[-trainIndex,]
Decision Tree classification: Method 1: Complexity Parameter method
# Decision Tree classification using Complexity Parameter
dtmodel<- rpart(Species ~ ., data = training_set, method = "class", control = rpart.control(cp=0.0001))
#plot the fitted model
plot(dtmodel, uniform= T, main="Decision Tree classification using Complexity Parameter", margin=c(0,0,0.2,0.1))
text(dtmodel, use.n=T, all=T, cex=0.6, col="blue")
Method 2: Cost Complexity Prunning
# Decision Tree classification using cost complexity pruning
dt_model <- train(Species ~ ., data=training_set, method="rpart",
trControl=trainControl(method="cv", number=10),
tuneGrid=data.frame(cp=seq(0,0.0001,by=0.01)),
metric="Accuracy")
plot(dt_model$finalModel, main="Decision Tree with Cost Complexity Pruning", mar = c(0,0,.2,.2))
text(dt_model$finalModel, use.n=T, all = T, cex= 0.8, col="brown")
Print the model
print(dtmodel, digits = 2)
## n= 114
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 114 76 setosa (0.333 0.333 0.333)
## 2) Petal.Length< 2.4 38 0 setosa (1.000 0.000 0.000) *
## 3) Petal.Length>=2.4 76 38 versicolor (0.000 0.500 0.500)
## 6) Petal.Width< 1.8 39 2 versicolor (0.000 0.949 0.051) *
## 7) Petal.Width>=1.8 37 1 virginica (0.000 0.027 0.973) *
print(dt_model, digits = 2)
## CART
##
## 114 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 102, 102, 104, 102, 102, 104, ...
## Resampling results:
##
## Accuracy Kappa
## 0.95 0.92
##
## Tuning parameter 'cp' was held constant at a value of 0
# kNN classification using cross-validation to determine optimal k
knn_model <- train(Species ~ ., data=training_set, method="knn",
trControl=trainControl(method="cv", number=10),
tuneGrid=data.frame(k=1:20),
metric="Accuracy")
plot(knn_model)
Predict and Confusion
#Make predictions on the test-data
IrisPredicted<- dtmodel %>% predict(test_set, type = "class")
head(IrisPredicted)
## 3 7 16 20 21 23
## setosa setosa setosa setosa setosa setosa
## Levels: setosa versicolor virginica
#confusion Matrix
confusionMatrix(factor(test_set$Species), IrisPredicted)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 12 0 0
## versicolor 0 12 0
## virginica 0 3 9
##
## Overall Statistics
##
## Accuracy : 0.9167
## 95% CI : (0.7753, 0.9825)
## No Information Rate : 0.4167
## P-Value [Acc > NIR] : 4.286e-10
##
## Kappa : 0.875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.8000 1.0000
## Specificity 1.0000 1.0000 0.8889
## Pos Pred Value 1.0000 1.0000 0.7500
## Neg Pred Value 1.0000 0.8750 1.0000
## Prevalence 0.3333 0.4167 0.2500
## Detection Rate 0.3333 0.3333 0.2500
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 0.9000 0.9444