# load the tidyverse package
library(tidyverse)
ARES4011 Submission
Modeling Blotch Time
#Clearing the workspace
rm(list = ls())
library(tidyverse)
library(caret)
library(mlbench)
# Add Data
<- read_excel("D:/ARES40011 Rsrch Methods & Data Analysis/Summative/sharks.xlsx")
sharks <- data.frame(sharks)
sharks
# Set seed for reproducibility
set.seed(234)
# Create training samples
<- sharks$blotch %>%
training.samples createDataPartition(p = 0.8, list = FALSE)
#Assign Partitioned data to dataframes
<-sharks[training.samples, ]
train.data<-sharks[-training.samples, ] test.data
Linear Regression
From our previous analysis, we know there is a strong correlation between depth and bloch time. Let’s fit the model and check its effectiveness
Also, air and meta were found to have correlated significantly so have been excluded from all models.
<-lm(blotch~depth, data=train.data)
modelsummary(model)
Call:
lm(formula = blotch ~ depth, data = train.data)
Residuals:
Min 1Q Median 3Q Max
-2.81044 -0.65674 -0.04245 0.57193 2.86632
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.81343 1.24859 8.66 <2e-16 ***
depth 0.48471 0.02487 19.49 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1 on 398 degrees of freedom
Multiple R-squared: 0.4882, Adjusted R-squared: 0.487
F-statistic: 379.7 on 1 and 398 DF, p-value: < 2.2e-16
This is pretty good! lets check its prediction capabilities:
# Create preditctions from model on test data
<-predict(model, test.data)
predictions
#Assess Models Accuracy
#MSE
<-mean((test.data$blotch-predictions)^2)
mse#Produce output including label
print(paste("LM1 mse:", mse))
[1] "LM1 mse: 1.00886315360145"
#Root Mean Squared Error
<- sqrt(mse)
rmse print(paste("LM1 RMSE:", rmse))
[1] "LM1 RMSE: 1.00442180064027"
# Calculate R-squared
<- sum((test.data$blotch - predictions)^2)
rss <- sum((test.data$blotch - mean(test.data$blotch))^2)
tss <- 1 - (rss/tss)
r_squared print(paste("LM1 R-squared:", r_squared))
[1] "LM1 R-squared: 0.57775730819532"
This is only explaing approx 57% of the varience so is not that helpful. Lets see if it is improved by the inclusion of other predictors
Multiple Linear Regression
<-lm(blotch~depth+sex+BPM+weight+weight+length+water, data=train.data)
glm_modelsummary(glm_model)
Call:
lm(formula = blotch ~ depth + sex + BPM + weight + weight + length +
water, data = train.data)
Residuals:
Min 1Q Median 3Q Max
-2.89728 -0.64890 -0.03427 0.61504 2.96668
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.246261 1.613731 6.349 5.97e-10 ***
depth 0.486297 0.024793 19.614 < 2e-16 ***
sexMale 0.286089 0.099858 2.865 0.00439 **
BPM -0.003430 0.003517 -0.975 0.32997
weight 0.003345 0.003729 0.897 0.37028
length 0.001402 0.001074 1.306 0.19235
water 0.010075 0.029618 0.340 0.73392
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9919 on 393 degrees of freedom
Multiple R-squared: 0.503, Adjusted R-squared: 0.4954
F-statistic: 66.28 on 6 and 393 DF, p-value: < 2.2e-16
This suggests that sex is a significant predictor, so lets rebuilt a more specific model and re-test
<- lm(blotch ~ depth + sex, data = train.data)
glm_model2 summary(glm_model2)
Call:
lm(formula = blotch ~ depth + sex, data = train.data)
Residuals:
Min 1Q Median 3Q Max
-2.93974 -0.63739 -0.02135 0.56798 2.96795
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.69327 1.23865 8.633 < 2e-16 ***
depth 0.48411 0.02466 19.629 < 2e-16 ***
sexMale 0.27946 0.09944 2.810 0.00519 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9916 on 397 degrees of freedom
Multiple R-squared: 0.4982, Adjusted R-squared: 0.4957
F-statistic: 197.1 on 2 and 397 DF, p-value: < 2.2e-16
# Create preditctions from model on test data
<-predict(glm_model2, test.data)
predictions2
#Assess Models Accuracy
#MSE
<-mean((test.data$blotch-predictions2)^2)
mse#Produce output including label
print(paste("GLM2 mse:", mse))
[1] "GLM2 mse: 0.971084389631148"
#Root Mean Squared Error
<- sqrt(mse)
rmse print(paste("GLM2 RMSE:", rmse))
[1] "GLM2 RMSE: 0.985436141833223"
# Calculate R-squared
<- sum((test.data$blotch - predictions2)^2)
rss <- sum((test.data$blotch - mean(test.data$blotch))^2)
tss <- 1 - (rss/tss)
r_squared print(paste("GLM2 R-squared:", r_squared))
[1] "GLM2 R-squared: 0.593568973964783"
Whilst this does improve the models accuracy, it still onnly explains 59% of the variation. From our earlier visual checks of teh data, we know that there were no polynomial relationships so we have reached the limit of the predtions possible with GLMs.
Modeling with sex split data
As identified in the ealier analysis, there is a significant difference between male and female blotch time. It is therefore interesting to see if predictions can be applied to one sex.
# Split the data by sex
<- sharks %>% filter(sex == "Female")
sharks_female <- sharks %>% filter(sex == "Male") sharks_male
# Set seed for reproducibility
set.seed(234)
# Create training samples
<- sharks_female$blotch %>%
training.samples_female createDataPartition(p = 0.8, list = FALSE)
#Assign Partitioned data to dataframes
<-sharks_female[training.samples_female, ]
train.data_female<-sharks_female[-training.samples_female, ] test.data_female
<-lm(blotch~depth, data=train.data_female)
model_fe_lmsummary(model_fe_lm)
Call:
lm(formula = blotch ~ depth, data = train.data_female)
Residuals:
Min 1Q Median 3Q Max
-2.48167 -0.55854 0.04126 0.59489 2.97202
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.58708 1.66676 6.352 1.53e-09 ***
depth 0.48619 0.03329 14.606 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9567 on 190 degrees of freedom
Multiple R-squared: 0.5289, Adjusted R-squared: 0.5264
F-statistic: 213.3 on 1 and 190 DF, p-value: < 2.2e-16
This is not very different from teh one for data including both sexs, but lets check its prediction capabilities:
# Create preditctions from model on test data
<- predict(model_fe_lm, test.data_female)
predictions_fe_lm
# Assess Model's Accuracy
# MSE
<- mean((test.data_female$blotch - predictions_fe_lm)^2)
mse_Fe # Produce output including label
print(paste("LM_Fe mse:", mse_Fe))
[1] "LM_Fe mse: 0.954896236658408"
# Root Mean Squared Error
<- sqrt(mse_Fe)
rmse_Fe print(paste("LM_Fe RMSE:", rmse_Fe))
[1] "LM_Fe RMSE: 0.977187922898359"
# Calculate R-squared
<- sum((test.data_female$blotch - predictions_fe_lm)^2)
rss_Lm_Fe <- sum((test.data_female$blotch - mean(test.data_female$blotch))^2)
tss_Lm_Fe <- 1 - (rss_Lm_Fe / tss_Lm_Fe)
r_squared_lm_Fe print(paste("LM_Fe R-squared:", r_squared_lm_Fe))
[1] "LM_Fe R-squared: 0.515810244670255"
This is only explaing approx 51% of the varience so is not that helpful and is owrse than our all data model. Lets see if it is improved by the inclusion of other predictors
Multiple Linear Regression
# Create linear model
<- lm(blotch ~ BPM + weight + length + water, data = train.data_female)
glm_model_fe
# Summarize the model
summary(glm_model_fe)
Call:
lm(formula = blotch ~ BPM + weight + length + water, data = train.data_female)
Residuals:
Min 1Q Median 3Q Max
-4.1677 -0.8189 -0.1133 0.9625 3.8311
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 34.8508524 1.9044964 18.299 <2e-16 ***
BPM -0.0077815 0.0073821 -1.054 0.293
weight 0.0012468 0.0074111 0.168 0.867
length 0.0005759 0.0021346 0.270 0.788
water 0.0403689 0.0592275 0.682 0.496
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.399 on 187 degrees of freedom
Multiple R-squared: 0.00886, Adjusted R-squared: -0.01234
F-statistic: 0.4179 on 4 and 187 DF, p-value: 0.7956
This suggests no other significant predictors for the female samples.
# Set seed for reproducibility
set.seed(234)
# Create training samples
<- sharks_male$blotch %>%
training.samples_male createDataPartition(p = 0.8, list = FALSE)
#Assign Partitioned data to dataframes
<-sharks_male[training.samples_male, ]
train.data_male<-sharks_male[-training.samples_male, ] test.data_male
<-lm(blotch~depth, data=train.data_male)
model_ma_lmsummary(model_ma_lm)
Call:
lm(formula = blotch ~ depth, data = train.data_male)
Residuals:
Min 1Q Median 3Q Max
-2.95147 -0.65302 -0.04974 0.57094 2.59508
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.46885 1.75637 5.391 1.88e-07 ***
depth 0.51437 0.03495 14.716 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.019 on 210 degrees of freedom
Multiple R-squared: 0.5077, Adjusted R-squared: 0.5053
F-statistic: 216.6 on 1 and 210 DF, p-value: < 2.2e-16
And the predictive test
# Create preditctions from model on test data
<- predict(model_ma_lm, test.data_male)
predictions_ma_lm
# Assess Model's Accuracy
# MSE
<- mean((test.data_male$blotch - predictions_ma_lm)^2)
mse_ma # Produce output including label
print(paste("LM_ma mse:", mse_ma))
[1] "LM_ma mse: 1.01657017672931"
# Root Mean Squared Error
<- sqrt(mse_ma)
rmse_ma print(paste("LM_ma RMSE:", rmse_ma))
[1] "LM_ma RMSE: 1.00825104846428"
# Calculate R-squared
<- sum((test.data_male$blotch - predictions_ma_lm)^2)
rss_Lm_ma <- sum((test.data_male$blotch - mean(test.data_male$blotch))^2)
tss_Lm_ma <- 1 - (rss_Lm_ma / tss_Lm_ma)
r_squared_lm_ma print(paste("LM_ma R-squared:", r_squared_lm_ma))
[1] "LM_ma R-squared: 0.469320413660358"
This is less accurate than our previous models. Lets see if there are any other significant factors:
# Create linear model
<- lm(blotch ~ depth + BPM + weight + length + water, data = train.data_male)
glm_model_ma
# Summarize the model
summary(glm_model_ma)
Call:
lm(formula = blotch ~ depth + BPM + weight + length + water,
data = train.data_male)
Residuals:
Min 1Q Median 3Q Max
-2.94887 -0.64104 -0.06228 0.58686 2.63736
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.3680604 2.3573214 4.398 1.75e-05 ***
depth 0.5132221 0.0356102 14.412 < 2e-16 ***
BPM -0.0019404 0.0049489 -0.392 0.695
weight 0.0002594 0.0055450 0.047 0.963
length 0.0004878 0.0015845 0.308 0.759
water -0.0299880 0.0446796 -0.671 0.503
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.026 on 206 degrees of freedom
Multiple R-squared: 0.5096, Adjusted R-squared: 0.4977
F-statistic: 42.82 on 5 and 206 DF, p-value: < 2.2e-16
This suggests no other significant predictors for the male samples either.
General Model Playing
Gamma GML
<-glm(blotch~depth+sex,
gammalfamily=Gamma(link=log),
data=train.data)
# Create preditctions from model on test data
<-predict(gammal, test.data)
predictions3
#Assess Models Accuracy
#MSE
<-mean((test.data$blotch-predictions3)^2)
mse#Produce output including label
print(paste("GAMMA GLM mse:", mse))
[1] "GAMMA GLM mse: 999.214085876252"
#Root Mean Squared Error
<- sqrt(mse)
rmse print(paste("GAMMA GLM RMSE:", rmse))
[1] "GAMMA GLM RMSE: 31.6103477658227"
# Calculate R-squared
<- sum((test.data$blotch - predictions3)^2)
rss <- sum((test.data$blotch - mean(test.data$blotch))^2)
tss <- 1 - (rss/tss)
r_squared print(paste("GAMMA GLM R-squared:", r_squared))
[1] "GAMMA GLM R-squared: -417.204236920936"
This is clearly inappropriate!
Decision Trees
# Load necessary library
library(rpart)
# Fit the decision tree model
<- rpart(blotch ~ depth + sex + BPM + weight + length + water, data = train.data)
DT_model
# Summarize the model
summary(DT_model)
Call:
rpart(formula = blotch ~ depth + sex + BPM + weight + length +
water, data = train.data)
n= 400
CP nsplit rel error xerror xstd
1 0.30682387 0 1.0000000 1.0091643 0.07451584
2 0.09293024 1 0.6931761 0.7547603 0.05704714
3 0.08655757 2 0.6002459 0.6957056 0.04812921
4 0.01948587 3 0.5136883 0.5744040 0.04157488
5 0.01716320 4 0.4942024 0.5633858 0.03977897
6 0.01054044 5 0.4770392 0.5441942 0.03931566
7 0.01000000 6 0.4664988 0.5350905 0.03698648
Variable importance
depth length BPM weight water
92 4 2 2 1
Node number 1: 400 observations, complexity param=0.3068239
mean=35.12411, MSE=1.944723
left son=2 (221 obs) right son=3 (179 obs)
Primary splits:
depth < 50.39554 to the left, improve=0.306823900, (0 missing)
sex splits as LR, improve=0.011225570, (0 missing)
weight < 88.7134 to the left, improve=0.010281100, (0 missing)
BPM < 137.5 to the right, improve=0.007445887, (0 missing)
water < 20.9844 to the left, improve=0.005938598, (0 missing)
Surrogate splits:
length < 151.2442 to the right, agree=0.573, adj=0.045, (0 split)
BPM < 123.5 to the right, agree=0.562, adj=0.022, (0 split)
water < 25.79412 to the left, agree=0.560, adj=0.017, (0 split)
weight < 110.581 to the left, agree=0.557, adj=0.011, (0 split)
Node number 2: 221 observations, complexity param=0.08655757
mean=34.42892, MSE=1.31364
left son=4 (77 obs) right son=5 (144 obs)
Primary splits:
depth < 48.35314 to the left, improve=0.23192850, (0 missing)
BPM < 124.5 to the left, improve=0.02452245, (0 missing)
water < 25.35719 to the left, improve=0.01914465, (0 missing)
sex splits as LR, improve=0.01570431, (0 missing)
weight < 74.06402 to the right, improve=0.01363231, (0 missing)
Surrogate splits:
length < 134.2956 to the left, agree=0.665, adj=0.039, (0 split)
water < 20.14037 to the left, agree=0.661, adj=0.026, (0 split)
weight < 110.363 to the right, agree=0.656, adj=0.013, (0 split)
Node number 3: 179 observations, complexity param=0.09293024
mean=35.98242, MSE=1.390501
left son=6 (123 obs) right son=7 (56 obs)
Primary splits:
depth < 52.37234 to the left, improve=0.29043590, (0 missing)
length < 181.6658 to the left, improve=0.02798548, (0 missing)
weight < 107.0718 to the left, improve=0.01740607, (0 missing)
BPM < 127.5 to the right, improve=0.01187385, (0 missing)
sex splits as LR, improve=0.00782871, (0 missing)
Surrogate splits:
BPM < 119.5 to the right, agree=0.698, adj=0.036, (0 split)
weight < 110.7731 to the left, agree=0.698, adj=0.036, (0 split)
length < 273.7339 to the left, agree=0.698, adj=0.036, (0 split)
Node number 4: 77 observations, complexity param=0.0171632
mean=33.67409, MSE=0.8852428
left son=8 (15 obs) right son=9 (62 obs)
Primary splits:
depth < 46.56121 to the left, improve=0.19586770, (0 missing)
BPM < 127.5 to the left, improve=0.15064330, (0 missing)
water < 21.0222 to the left, improve=0.09737779, (0 missing)
sex splits as LR, improve=0.07784480, (0 missing)
weight < 90.9203 to the right, improve=0.04637677, (0 missing)
Surrogate splits:
BPM < 122.5 to the left, agree=0.818, adj=0.067, (0 split)
weight < 66.26287 to the left, agree=0.818, adj=0.067, (0 split)
Node number 5: 144 observations, complexity param=0.01054044
mean=34.83255, MSE=1.075129
left son=10 (27 obs) right son=11 (117 obs)
Primary splits:
depth < 48.98858 to the left, improve=0.05296068, (0 missing)
water < 20.72111 to the right, improve=0.04888154, (0 missing)
weight < 92.73019 to the left, improve=0.03249283, (0 missing)
length < 179.066 to the left, improve=0.02505633, (0 missing)
BPM < 153 to the right, improve=0.02232216, (0 missing)
Surrogate splits:
length < 290.0006 to the right, agree=0.826, adj=0.074, (0 split)
Node number 6: 123 observations
mean=35.55362, MSE=0.8542041
Node number 7: 56 observations, complexity param=0.01948587
mean=36.92424, MSE=1.277557
left son=14 (47 obs) right son=15 (9 obs)
Primary splits:
depth < 54.16999 to the left, improve=0.21186980, (0 missing)
length < 177.0123 to the left, improve=0.06709445, (0 missing)
water < 21.27713 to the left, improve=0.05156503, (0 missing)
BPM < 142 to the right, improve=0.02526993, (0 missing)
weight < 100.1431 to the left, improve=0.01819176, (0 missing)
Node number 8: 15 observations
mean=32.82752, MSE=0.6336963
Node number 9: 62 observations
mean=33.8789, MSE=0.7307611
Node number 10: 27 observations
mean=34.33582, MSE=1.055709
Node number 11: 117 observations
mean=34.94718, MSE=1.009531
Node number 14: 47 observations
mean=36.69658, MSE=0.9547175
Node number 15: 9 observations
mean=38.11316, MSE=1.279294
# Fit the refined decision tree model
<- rpart(blotch ~ depth + length, data = train.data)
DT_model1
# Create preditctions from model on test data
<-predict(DT_model1, test.data)
predictions4
#Assess Models Accuracy
#MSE
<-mean((test.data$blotch-predictions4)^2)
mse#Produce output including label
print(paste("DT1 mse:", mse))
[1] "DT1 mse: 0.973644768541162"
#Root Mean Squared Error
<- sqrt(mse)
rmse print(paste("DT1 RMSE:", rmse))
[1] "DT1 RMSE: 0.986734396147799"
# Calculate R-squared
<- sum((test.data$blotch - predictions4)^2)
rss <- sum((test.data$blotch - mean(test.data$blotch))^2)
tss <- 1 - (rss/tss)
r_squared print(paste("DT1 R-squared:", r_squared))
[1] "DT1 R-squared: 0.592497370468169"
This is no more accurate than the GLM.