The Fish Market dataset offers a valuable resource for examining the relationship between the physical attributes of various fish species and their corresponding weights. Accurate prediction of fish weight based on non-invasive measurements is critical for the fisheries industry, affecting everything from stock management to ecological studies and commercial pricing strategies. However, capturing the complexities of biological variations across different species using standard linear models can be challenging due to the nonlinear nature of biological data.
This study seeks to address these challenges by employing a combination of linear, Box-Cox transformation, logarithimic transformation and weighted least square techniques to find out the best predictive models. By doing so, it aims to not only enhance the predictive accuracy but also provide insights into the factors that most significantly influence fish weight. These insights are crucial for stakeholders in fisheries science, providing a basis for more informed decision-making and contributing to the broader field of aquatic life studies.
Dataset sourced from - Fish market. (n.d.). Retrieved June 13, 2024, from https://www.kaggle.com/datasets/vipullrathod/fish-market
fish_df <- read.csv("Fish.csv")
head(fish_df)
## Species Weight Length1 Length2 Length3 Height Width
## 1 Bream 242 23.2 25.4 30.0 11.5200 4.0200
## 2 Bream 290 24.0 26.3 31.2 12.4800 4.3056
## 3 Bream 340 23.9 26.5 31.1 12.3778 4.6961
## 4 Bream 363 26.3 29.0 33.5 12.7300 4.4555
## 5 Bream 430 26.5 29.0 34.0 12.4440 5.1340
## 6 Bream 450 26.8 29.7 34.7 13.6024 4.9274
#summary of the data
summary(fish_df)
## Species Weight Length1 Length2
## Length:159 Min. : 0.0 Min. : 7.50 Min. : 8.40
## Class :character 1st Qu.: 120.0 1st Qu.:19.05 1st Qu.:21.00
## Mode :character Median : 273.0 Median :25.20 Median :27.30
## Mean : 398.3 Mean :26.25 Mean :28.42
## 3rd Qu.: 650.0 3rd Qu.:32.70 3rd Qu.:35.50
## Max. :1650.0 Max. :59.00 Max. :63.40
## Length3 Height Width
## Min. : 8.80 Min. : 1.728 Min. :1.048
## 1st Qu.:23.15 1st Qu.: 5.945 1st Qu.:3.386
## Median :29.40 Median : 7.786 Median :4.248
## Mean :31.23 Mean : 8.971 Mean :4.417
## 3rd Qu.:39.65 3rd Qu.:12.366 3rd Qu.:5.585
## Max. :68.00 Max. :18.957 Max. :8.142
# Structure of the data
str(fish_df)
## 'data.frame': 159 obs. of 7 variables:
## $ Species: chr "Bream" "Bream" "Bream" "Bream" ...
## $ Weight : num 242 290 340 363 430 450 500 390 450 500 ...
## $ Length1: num 23.2 24 23.9 26.3 26.5 26.8 26.8 27.6 27.6 28.5 ...
## $ Length2: num 25.4 26.3 26.5 29 29 29.7 29.7 30 30 30.7 ...
## $ Length3: num 30 31.2 31.1 33.5 34 34.7 34.5 35 35.1 36.2 ...
## $ Height : num 11.5 12.5 12.4 12.7 12.4 ...
## $ Width : num 4.02 4.31 4.7 4.46 5.13 ...
#finding null values
sum(is.na(fish_df))
## [1] 0
#Detecting special values
is.special <- function(x){ if (is.numeric(x)) (is.infinite(x) | is.nan(x))}
sapply(fish_df, function(x) sum( is.special(x)))
## Species Weight Length1 Length2 Length3 Height Width
## 0 0 0 0 0 0 0
# Convert data from wide to long format
fish_long <- pivot_longer(fish_df, cols = c(Length1, Length2, Length3), names_to = "LengthType", values_to = "Value")
# Create a box plot comparing Length1, Length2, and Length3 across Species with consistent colors for each species
ggplot(fish_long, aes(x = Species, y = Value, fill = Species)) +
geom_boxplot() +
facet_wrap(~ LengthType, scales = "free_y") +
scale_fill_brewer(palette = "Set3", name = "Species") + # Using a Brewer palette for distinct colors per species
labs(title = "Comparison of Fish Lengths by Species",
x = "Species",
y = "Length (cm)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Adjust text angle for better readability
ggplot(fish_df, aes(x = Species, y = Weight, color = Species)) +
geom_jitter(width = 0.2, alpha = 0.6, size = 2) +
labs(title = "Jitter Plot of Weight by Species", x = "Species", y = "Weight (g)") +
theme_minimal() +
scale_color_brewer(palette = "Set1")
ggplot(fish_df, aes(x = Length3, y = Weight, color = Species)) +
geom_point() +
labs(title = "Scatter plot of Weight vs. Length3 by Species", x = "Length3 (cm)", y = "Weight (g)") +
theme_minimal() +
scale_color_brewer(palette="Set1")
check_model_adequacy <- function(model) {
# Summary of the model
print(summary(model))
# Diagnostic Plots
par(mfrow = c(2, 2))
plot(model)
# ANOVA table
anova_results <- anova(model)
print(anova_results)
# Non-Constant Variance Test
ncv_test <- ncvTest(model)
print(ncv_test)
# Conditionally performing Durbin-Watson test if model is not weighted
if (is.null(weights(model))) {
dw_test <- dwtest(model)
print(dw_test)
} else {
cat("Skipping Durbin-Watson test: weighted regressions are not supported.\n")
}
# Shapiro-Wilk Normality Test of Residuals
shapiro_test <- shapiro.test(residuals(model))
print(shapiro_test)
# Conditional Component + Residual Plot
if (!any(grepl(":", names(coef(model))))) { # Check if interaction terms exist
crPlots(model)
} else {
cat("Skipping C+R plots due to interaction terms. Using termplot instead.\n")
termplot(model, partial.resid = TRUE, se = TRUE)
}
# Breusch-Pagan test
bp_test <- bptest(model)
print(bp_test)
# Variance Inflation Factor
vif_results <- vif(model)
print(vif_results)
# Outlier Test
outliers <- outlierTest(model)
if (is.null(outliers)) {
cat("No significant outliers detected.\n")
} else {
print(outliers)
}
# Influence Measures
influence_measures <- influence.measures(model)
print(summary(influence_measures))
par(mfrow = c(1, 1))
}
# Fitting the linear model
lm_model <- lm(Weight ~ Length1 + Length2 + Length3 + Height + Width + factor(Species), data = fish_df)
Adequacy Check
check_model_adequacy(lm_model)
##
## Call:
## lm(formula = Weight ~ Length1 + Length2 + Length3 + Height +
## Width + factor(Species), data = fish_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -213.18 -53.19 -12.62 36.49 420.82
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -918.3321 127.0831 -7.226 2.5e-11 ***
## Length1 -80.3030 36.2785 -2.214 0.028403 *
## Length2 79.8886 45.7180 1.747 0.082653 .
## Length3 32.5354 29.3002 1.110 0.268633
## Height 5.2510 13.0560 0.402 0.688128
## Width -0.5154 23.9130 -0.022 0.982832
## factor(Species)Parkki 164.7227 75.6995 2.176 0.031152 *
## factor(Species)Perch 137.9489 120.3135 1.147 0.253419
## factor(Species)Pike -208.4294 135.3064 -1.540 0.125607
## factor(Species)Roach 103.0400 91.3084 1.128 0.260954
## factor(Species)Smelt 446.0733 119.4303 3.735 0.000268 ***
## factor(Species)Whitefish 93.8742 96.6580 0.971 0.333045
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 93.83 on 147 degrees of freedom
## Multiple R-squared: 0.9361, Adjusted R-squared: 0.9313
## F-statistic: 195.7 on 11 and 147 DF, p-value: < 2.2e-16
## Analysis of Variance Table
##
## Response: Weight
## Df Sum Sq Mean Sq F value Pr(>F)
## Length1 1 16978060 16978060 1928.5523 < 2.2e-16 ***
## Length2 1 235134 235134 26.7091 7.589e-07 ***
## Length3 1 74109 74109 8.4181 0.004287 **
## Height 1 619028 619028 70.3160 3.787e-14 ***
## Width 1 18474 18474 2.0985 0.149571
## factor(Species) 6 1028534 171422 19.4720 < 2.2e-16 ***
## Residuals 147 1294118 8804
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.03271351, Df = 1, p = 0.85647
##
## Durbin-Watson test
##
## data: model
## DW = 0.97285, p-value = 1.826e-13
## alternative hypothesis: true autocorrelation is greater than 0
##
##
## Shapiro-Wilk normality test
##
## data: residuals(model)
## W = 0.93335, p-value = 9.145e-07
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 16.733, df = 11, p-value = 0.116
##
## GVIF Df GVIF^(1/(2*Df))
## Length1 2360.42508 1 48.584206
## Length2 4307.91811 1 65.634732
## Length3 2076.93715 1 45.573426
## Height 56.20370 1 7.496913
## Width 29.16651 1 5.400602
## factor(Species) 1509.77571 6 1.840388
## rstudent unadjusted p-value Bonferroni p
## 73 5.041405 1.3492e-06 0.00021452
## 143 3.726958 2.7645e-04 0.04395500
## Potentially influential observations of
## lm(formula = Weight ~ Length1 + Length2 + Length3 + Height + Width + factor(Species), data = fish_df) :
##
## dfb.1_ dfb.Lng1 dfb.Lng2 dfb.Lng3 dfb.Hght dfb.Wdth dfb.fctr(Spcs)Prk
## 14 -0.14 -0.19 0.22 -0.13 0.24 -0.06 0.13
## 34 0.03 -0.05 0.10 -0.11 0.07 -0.02 -0.06
## 36 0.16 -0.05 0.10 -0.10 -0.04 0.05 -0.13
## 56 0.02 0.06 -0.04 -0.01 -0.02 0.06 -0.02
## 59 0.02 0.00 0.00 -0.01 -0.01 0.05 -0.01
## 61 -0.09 0.25 -0.20 0.04 0.09 -0.20 0.05
## 73 0.61 0.26 -0.03 -0.30 0.08 0.02 -0.49
## 74 0.01 0.14 -0.19 0.13 0.08 -0.20 0.03
## 112 0.09 0.04 -0.06 -0.03 -0.14 0.70 -0.05
## 130 0.00 0.00 0.01 -0.01 0.00 0.00 0.00
## 131 0.09 -0.02 0.12 -0.14 0.00 -0.06 -0.10
## 133 0.02 0.01 0.01 -0.03 0.03 0.01 -0.02
## 143 -0.23 -0.30 0.17 0.26 -0.54 -0.68 0.18
## 144 -0.19 -0.25 0.14 0.22 -0.44 -0.56 0.15
## 145 -0.26 -0.41 0.09 0.41 -0.59 0.00 0.27
## dfb.fctr(Spcs)Prc dfb.fctr(Spcs)Pk dfb.f(S)R dfb.f(S)S dfb.f(S)W dffit
## 14 0.12 0.23 0.22 0.19 0.15 -0.54
## 34 -0.04 0.00 -0.01 -0.02 -0.03 0.16
## 36 -0.12 -0.05 0.01 -0.13 -0.10 0.62
## 56 -0.03 -0.02 -0.03 -0.02 -0.16 -0.35
## 59 -0.02 0.00 -0.02 -0.01 0.02 0.11
## 61 0.08 -0.02 0.07 0.05 0.41 0.89_*
## 73 -0.20 0.00 -0.34 -0.47 -0.19 1.46_*
## 74 0.16 0.10 0.06 0.01 0.14 0.62
## 112 -0.10 0.12 -0.12 -0.03 -0.16 0.86_*
## 130 -0.01 -0.01 0.00 0.00 0.00 -0.01
## 131 -0.11 -0.13 -0.07 -0.09 -0.08 -0.22
## 133 -0.01 0.03 0.00 -0.01 0.00 0.08
## 143 -0.03 -0.58 -0.06 -0.04 -0.07 1.83_*
## 144 -0.03 -0.47 -0.05 -0.03 -0.06 1.51_*
## 145 0.03 -0.29 -0.03 0.08 -0.06 1.18_*
## cov.r cook.d hat
## 14 0.73_* 0.02 0.05
## 34 1.25_* 0.00 0.14
## 36 0.72_* 0.03 0.06
## 56 1.27_* 0.01 0.18
## 59 1.40_* 0.00 0.23
## 61 1.05 0.07 0.20
## 73 0.17_* 0.15 0.08
## 74 0.69_* 0.03 0.06
## 112 0.96 0.06 0.17
## 130 1.41_* 0.00 0.23_*
## 131 1.37_* 0.00 0.22
## 133 1.25_* 0.00 0.13
## 143 0.45_* 0.26 0.19
## 144 0.64_* 0.18 0.19
## 145 0.92 0.11 0.22
## dfb.1_ dfb.Lng1 dfb.Lng2 dfb.Lng3 dfb.Hght
## 14 -0.141616588 -0.1929315429 0.218982889 -0.131860707 0.2407305160
## 34 0.030440560 -0.0462153032 0.104886195 -0.108042122 0.0696512072
## 36 0.163232617 -0.0488938639 0.098306132 -0.101279759 -0.0426674983
## 56 0.022290078 0.0574691689 -0.038288970 -0.009655143 -0.0230484924
## 59 0.018040929 -0.0016477278 0.004612943 -0.010384556 -0.0137258729
## 61 -0.086555887 0.2532906094 -0.201878871 0.041163663 0.0890340521
## 73 0.606434482 0.2558296423 -0.028738594 -0.295617768 0.0763125406
## 74 0.006242169 0.1414483779 -0.193851039 0.130907288 0.0819001657
## 112 0.089522514 0.0434279233 -0.060712630 -0.028098785 -0.1363368027
## 130 0.004312081 -0.0003973092 0.005154002 -0.006778055 -0.0001156042
## 131 0.086122649 -0.0186653860 0.118213300 -0.144744712 0.0036573538
## 133 0.016359600 0.0097792744 0.010388821 -0.032783338 0.0327090729
## 143 -0.227460671 -0.3049476308 0.172959832 0.263531536 -0.5352820604
## 144 -0.186973292 -0.2506677840 0.142173453 0.216623641 -0.4400033133
## 145 -0.255315561 -0.4107836183 0.089871107 0.414278901 -0.5945522588
## dfb.Wdth dfb.fctr(Spcs)Prk dfb.fctr(Spcs)Prc dfb.fctr(Spcs)Pk
## 14 -0.057005595 0.127634525 0.123759382 0.228136901
## 34 -0.023864059 -0.055407351 -0.044044944 -0.004707184
## 36 0.049965389 -0.134595482 -0.116295411 -0.051909245
## 56 0.056696904 -0.019612601 -0.027971878 -0.018977680
## 59 0.053373569 -0.013659389 -0.019010921 0.002721119
## 61 -0.198877437 0.054213874 0.080755059 -0.022709299
## 73 0.017302670 -0.488796208 -0.200666725 0.004777082
## 74 -0.202101728 0.026201588 0.164585525 0.101960252
## 112 0.704040862 -0.051732791 -0.103979036 0.119371823
## 130 -0.002100050 -0.004895387 -0.005206735 -0.006101511
## 131 -0.057172961 -0.100231704 -0.105869386 -0.128582745
## 133 0.013880589 -0.018371572 -0.008186606 0.031689176
## 143 -0.675438601 0.183397864 -0.030573190 -0.576548851
## 144 -0.555212371 0.150753544 -0.025131245 -0.473924727
## 145 0.002653725 0.272890884 0.026155515 -0.289650884
## dfb.f(S)R dfb.f(S)S dfb.f(S)W dffit cov.r cook.d
## 14 0.2180407210 0.189940693 0.150838651 -0.543384478 0.7270559 2.385827e-02
## 34 -0.0111900976 -0.017900539 -0.032191666 0.156793491 1.2508137 2.060631e-03
## 36 0.0107610589 -0.134322673 -0.098549398 0.623309759 0.7193873 3.133267e-02
## 56 -0.0302690499 -0.024269611 -0.161098989 -0.348857905 1.2669205 1.017294e-02
## 59 -0.0178246711 -0.012409254 0.018512141 0.112703395 1.3962362 1.065436e-03
## 61 0.0698430795 0.047469008 0.407856443 0.894170047 1.0545803 6.567149e-02
## 73 -0.3429270599 -0.466616754 -0.188641465 1.457103401 0.1714125 1.517281e-01
## 74 0.0593840165 0.010114573 0.135172161 0.618633448 0.6891464 3.076807e-02
## 112 -0.1224779451 -0.032814716 -0.164170353 0.863355569 0.9605187 6.097730e-02
## 130 -0.0037926953 -0.004482028 -0.004209055 -0.009972483 1.4071660 8.344279e-06
## 131 -0.0741396186 -0.089921539 -0.084324939 -0.221023729 1.3697819 4.093953e-03
## 133 -0.0007786165 -0.005038435 -0.003168836 0.080498948 1.2453740 5.435461e-04
## 143 -0.0632140761 -0.036780579 -0.068403619 1.832225575 0.4528524 2.572007e-01
## 144 -0.0519621429 -0.030233736 -0.056227961 1.506094419 0.6381089 1.788258e-01
## 145 -0.0303588844 0.084209296 -0.063033399 1.181811945 0.9198758 1.132597e-01
## hat
## 14 0.05004871
## 34 0.14273832
## 36 0.06182012
## 56 0.18111993
## 59 0.22543609
## 61 0.20285900
## 73 0.07709637
## 74 0.05666553
## 112 0.16605984
## 130 0.22871224
## 131 0.21890489
## 133 0.13154547
## 143 0.19464247
## 144 0.19464247
## 145 0.21622470
# Determining the minimum value in the response variable
min_value <- min(fish_df$Weight)
# If the minimum value is less than or equal to zero, adjusting the data
if(min_value <= 0) {
offset <- abs(min_value) + 1 # Ensuring all values are positive
fish_df$Weight_adjusted <- log(fish_df$Weight + offset)
} else {
fish_df$Weight_adjusted <- log(fish_df$Weight)
}
# Refitting the model with the log-transformed variable
lm_model_log <- lm(Weight_adjusted ~ Length1 + Length2 + Length3 + Height + Width + factor(Species), data = fish_df)
Adequacy Check
check_model_adequacy(lm_model_log)
##
## Call:
## lm(formula = Weight_adjusted ~ Length1 + Length2 + Length3 +
## Height + Width + factor(Species), data = fish_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4730 -0.0723 0.0416 0.1561 0.4420
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.10901 0.58563 3.601 0.000432 ***
## Length1 0.07793 0.16718 0.466 0.641796
## Length2 -0.23099 0.21068 -1.096 0.274687
## Length3 0.20581 0.13502 1.524 0.129582
## Height 0.03297 0.06017 0.548 0.584559
## Width 0.21191 0.11020 1.923 0.056406 .
## factor(Species)Parkki 0.38596 0.34884 1.106 0.270353
## factor(Species)Perch 0.44599 0.55443 0.804 0.422465
## factor(Species)Pike 0.09975 0.62352 0.160 0.873115
## factor(Species)Roach 0.00239 0.42077 0.006 0.995476
## factor(Species)Smelt -0.81897 0.55036 -1.488 0.138879
## factor(Species)Whitefish 0.46379 0.44542 1.041 0.299479
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4324 on 147 degrees of freedom
## Multiple R-squared: 0.907, Adjusted R-squared: 0.9
## F-statistic: 130.3 on 11 and 147 DF, p-value: < 2.2e-16
## Analysis of Variance Table
##
## Response: Weight_adjusted
## Df Sum Sq Mean Sq F value Pr(>F)
## Length1 1 218.409 218.409 1168.2779 < 2.2e-16 ***
## Length2 1 21.874 21.874 117.0029 < 2.2e-16 ***
## Length3 1 1.815 1.815 9.7096 0.002205 **
## Height 1 12.469 12.469 66.6989 1.323e-13 ***
## Width 1 3.491 3.491 18.6738 2.842e-05 ***
## factor(Species) 6 9.845 1.641 8.7772 3.547e-08 ***
## Residuals 147 27.482 0.187
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 31.67918, Df = 1, p = 1.8186e-08
##
## Durbin-Watson test
##
## data: model
## DW = 1.8983, p-value = 0.08929
## alternative hypothesis: true autocorrelation is greater than 0
##
##
## Shapiro-Wilk normality test
##
## data: residuals(model)
## W = 0.43, p-value < 2.2e-16
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 9.5386, df = 11, p-value = 0.5723
##
## GVIF Df GVIF^(1/(2*Df))
## Length1 2360.42508 1 48.584206
## Length2 4307.91811 1 65.634732
## Length3 2076.93715 1 45.573426
## Height 56.20370 1 7.496913
## Width 29.16651 1 5.400602
## factor(Species) 1509.77571 6 1.840388
## rstudent unadjusted p-value Bonferroni p
## 41 -22.48584 2.8169e-49 4.4789e-47
## Potentially influential observations of
## lm(formula = Weight_adjusted ~ Length1 + Length2 + Length3 + Height + Width + factor(Species), data = fish_df) :
##
## dfb.1_ dfb.Lng1 dfb.Lng2 dfb.Lng3 dfb.Hght dfb.Wdth dfb.fctr(Spcs)Prk
## 30 0.03 0.02 -0.02 0.00 -0.04 0.01 -0.02
## 34 -0.03 0.05 -0.11 0.11 -0.07 0.02 0.06
## 41 -1.28_* -0.37 -1.15_* 2.23_* -1.23_* 0.32 1.45_*
## 56 -0.01 -0.02 0.01 0.00 0.01 -0.02 0.01
## 57 -0.01 -0.01 0.01 0.01 0.00 -0.01 0.01
## 58 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 59 -0.02 0.00 0.00 0.01 0.01 -0.05 0.01
## 60 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 61 0.01 -0.03 0.02 0.00 -0.01 0.02 -0.01
## 73 -0.44 -0.18 0.02 0.21 -0.05 -0.01 0.35
## 112 0.00 0.00 0.00 0.00 0.00 0.02 0.00
## 130 -0.02 0.00 -0.02 0.03 0.00 0.01 0.02
## 131 0.04 -0.01 0.06 -0.07 0.00 -0.03 -0.05
## 143 0.02 0.03 -0.01 -0.02 0.05 0.06 -0.02
## 144 0.02 0.03 -0.02 -0.03 0.06 0.07 -0.02
## 145 0.21 0.33 -0.07 -0.33 0.48 0.00 -0.22
## dfb.fctr(Spcs)Prc dfb.fctr(Spcs)Pk dfb.f(S)R dfb.f(S)S dfb.f(S)W dffit
## 30 -0.02 -0.04 -0.03 -0.03 -0.02 -0.06
## 34 0.05 0.00 0.01 0.02 0.03 -0.16
## 41 1.06_* 0.13 -0.60 0.93 0.69 -5.77_*
## 56 0.01 0.01 0.01 0.01 0.06 0.14
## 57 0.01 0.00 0.00 0.00 0.04 0.09
## 58 0.00 0.00 0.00 0.00 0.00 0.01
## 59 0.02 0.00 0.02 0.01 -0.02 -0.12
## 60 0.00 0.00 0.00 0.00 -0.01 -0.04
## 61 -0.01 0.00 -0.01 -0.01 -0.04 -0.10
## 73 0.14 0.00 0.25 0.34 0.14 -1.05_*
## 112 0.00 0.00 0.00 0.00 -0.01 0.03
## 130 0.02 0.03 0.02 0.02 0.02 0.05
## 131 -0.05 -0.06 -0.04 -0.04 -0.04 -0.10
## 143 0.00 0.05 0.01 0.00 0.01 -0.16
## 144 0.00 0.06 0.01 0.00 0.01 -0.20
## 145 -0.02 0.23 0.02 -0.07 0.05 -0.95_*
## cov.r cook.d hat
## 30 1.26_* 0.00 0.14
## 34 1.25_* 0.00 0.14
## 41 0.00_* 0.63 0.06
## 56 1.32_* 0.00 0.18
## 57 1.32_* 0.00 0.18
## 58 1.31_* 0.00 0.17
## 59 1.40_* 0.00 0.23
## 60 1.32_* 0.00 0.18
## 61 1.36_* 0.00 0.20
## 73 0.42_* 0.08 0.08
## 112 1.30_* 0.00 0.17
## 130 1.41_* 0.00 0.23_*
## 131 1.39_* 0.00 0.22
## 143 1.34_* 0.00 0.19
## 144 1.33_* 0.00 0.19
## 145 1.06 0.07 0.22
## dfb.1_ dfb.Lng1 dfb.Lng2 dfb.Lng3 dfb.Hght
## 30 2.580657e-02 0.0223299207 -0.0154863393 0.0014146489 -0.0397794935
## 34 -3.167602e-02 0.0480910000 -0.1091431122 0.1124271259 -0.0724780748
## 41 -1.277672e+00 -0.3652168932 -1.1501951380 2.2307934786 -1.2298503611
## 56 -8.628075e-03 -0.0222452468 0.0148209484 0.0037373265 0.0089216429
## 57 -5.743301e-03 -0.0131788974 0.0050813933 0.0087001318 -0.0027703164
## 58 1.113439e-05 -0.0005213034 0.0003499015 0.0001243448 -0.0001206951
## 59 -1.856940e-02 0.0016959945 -0.0047480692 0.0106887494 0.0141279431
## 60 -2.214342e-03 -0.0043581698 0.0009228821 0.0038357026 -0.0005247825
## 61 9.340263e-03 -0.0273326402 0.0217847893 -0.0044419791 -0.0096076823
## 73 -4.359304e-01 -0.1839010239 0.0206585007 0.2125023894 -0.0548566391
## 112 2.798658e-03 0.0013576464 -0.0018980020 -0.0008784260 -0.0042621696
## 130 -2.015384e-02 0.0018569474 -0.0240888207 0.0316793318 0.0005403120
## 131 4.070483e-02 -0.0088219699 0.0558720924 -0.0684118449 0.0017286042
## 143 1.968342e-02 0.0263887900 -0.0149671623 -0.0228048283 0.0463208907
## 144 2.466371e-02 0.0330656664 -0.0187541451 -0.0285748929 0.0580409758
## 145 2.059868e-01 0.3314173449 -0.0725073794 -0.3342373143 0.4796805964
## dfb.Wdth dfb.fctr(Spcs)Prk dfb.fctr(Spcs)Prc dfb.fctr(Spcs)Pk
## 30 0.0115717348 -1.620837e-02 -2.356760e-02 -0.0362766148
## 34 0.0248326070 5.765612e-02 4.583255e-02 0.0048982302
## 41 0.3218125404 1.450561e+00 1.055103e+00 0.1274248028
## 56 -0.0219463175 7.591673e-03 1.082739e-02 0.0073459070
## 57 -0.0130984543 6.345264e-03 6.542872e-03 -0.0005500383
## 58 -0.0005679361 2.806242e-05 6.821756e-05 -0.0001620613
## 59 -0.0549370345 1.405951e-02 1.956781e-02 -0.0028008282
## 60 -0.0040868324 2.624618e-03 3.056877e-03 0.0011306529
## 61 0.0214609039 -5.850230e-03 -8.714295e-03 0.0024505650
## 73 -0.0124378813 3.513671e-01 1.442476e-01 -0.0034339660
## 112 0.0220097692 -1.617274e-03 -3.250599e-03 0.0037318093
## 130 0.0098152325 2.288010e-02 2.433528e-02 0.0285172975
## 131 -0.0270221113 -4.737331e-02 -5.003789e-02 -0.0607730861
## 143 0.0584494043 -1.587042e-02 2.645666e-03 0.0498919322
## 144 0.0732382390 -1.988595e-02 3.315070e-03 0.0625155602
## 145 -0.0021410064 -2.201665e-01 -2.110209e-02 0.2336883034
## dfb.f(S)R dfb.f(S)S dfb.f(S)W dffit cov.r
## 30 -3.070229e-02 -3.116635e-02 -0.022421058 -0.057170418 1.264957e+00
## 34 1.164426e-02 1.862705e-02 0.033498199 -0.163157120 1.249559e+00
## 41 -5.961052e-01 9.337748e-01 0.688954220 -5.767246675 1.851830e-08
## 56 1.171659e-02 9.394315e-03 0.062358424 0.135036409 1.316471e+00
## 57 4.910575e-03 4.347414e-03 0.039150132 0.088185560 1.323355e+00
## 58 2.135671e-05 -4.216138e-05 0.002893397 0.007203756 1.309492e+00
## 59 1.834681e-02 1.277276e-02 -0.019054415 -0.116004801 1.395939e+00
## 60 2.220594e-03 2.003827e-03 -0.013629016 -0.042296773 1.315864e+00
## 61 -7.536781e-03 -5.122390e-03 -0.044011870 -0.096490068 1.357478e+00
## 73 2.465103e-01 3.354236e-01 0.135603358 -1.047426737 4.183338e-01
## 112 -3.828913e-03 -1.025856e-03 -0.005132304 0.026990275 1.301093e+00
## 130 1.772633e-02 2.094814e-02 0.019672317 0.046609478 1.406358e+00
## 131 -3.504120e-02 -4.250033e-02 -0.039855167 -0.104464204 1.385097e+00
## 143 5.470260e-03 3.182825e-03 0.005919340 -0.158552522 1.336206e+00
## 144 6.854343e-03 3.988142e-03 0.007417048 -0.198669390 1.329716e+00
## 145 2.449334e-02 -6.793947e-02 0.050854905 -0.953477596 1.059399e+00
## cook.d hat
## 30 2.742003e-04 0.14335488
## 34 2.231105e-03 0.14273832
## 41 6.252929e-01 0.06172339
## 56 1.529114e-03 0.18111993
## 57 6.523405e-04 0.18219171
## 58 4.354121e-06 0.17117644
## 59 1.128750e-03 0.22543609
## 60 1.500972e-04 0.17574171
## 61 7.809795e-04 0.20285900
## 73 8.445448e-02 0.07709637
## 112 6.112051e-05 0.16605984
## 130 1.822678e-04 0.22871224
## 131 9.153821e-04 0.21890489
## 143 2.107756e-03 0.19464247
## 144 3.307955e-03 0.19464247
## 145 7.459516e-02 0.21622470
# Assuming heteroscedasticity pattern is identified, e.g., variance increasing with a predictor
weights <- 1 / fitted(lm_model_log)^2
# Fitting the WLS model
wls_model <- lm(Weight ~ Length1 + Length2 + Length3 + Height + Width + factor(Species), data = fish_df, weights = weights)
Adequacy Check
check_model_adequacy(wls_model)
##
## Call:
## lm(formula = Weight ~ Length1 + Length2 + Length3 + Height +
## Width + factor(Species), data = fish_df, weights = weights)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -35.466 -11.460 -3.076 7.407 88.865
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -654.91 132.79 -4.932 2.18e-06 ***
## Length1 16.43 37.30 0.440 0.6604
## Length2 -2.27 43.91 -0.052 0.9588
## Length3 14.03 29.72 0.472 0.6375
## Height 15.18 15.03 1.010 0.3142
## Width 13.64 26.78 0.509 0.6112
## factor(Species)Parkki 58.05 78.68 0.738 0.4618
## factor(Species)Perch 81.72 123.71 0.661 0.5099
## factor(Species)Pike -127.89 148.86 -0.859 0.3917
## factor(Species)Roach 25.82 97.39 0.265 0.7913
## factor(Species)Smelt 277.07 124.54 2.225 0.0276 *
## factor(Species)Whitefish 58.86 102.39 0.575 0.5663
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.27 on 147 degrees of freedom
## Multiple R-squared: 0.9224, Adjusted R-squared: 0.9166
## F-statistic: 158.9 on 11 and 147 DF, p-value: < 2.2e-16
## Analysis of Variance Table
##
## Response: Weight
## Df Sum Sq Mean Sq F value Pr(>F)
## Length1 1 532327 532327 1595.4483 < 2.2e-16 ***
## Length2 1 27 27 0.0805 0.7770436
## Length3 1 5333 5333 15.9846 0.0001007 ***
## Height 1 15423 15423 46.2231 2.465e-10 ***
## Width 1 39 39 0.1163 0.7335544
## factor(Species) 6 29916 4986 14.9437 2.658e-13 ***
## Residuals 147 49047 334
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.7019662, Df = 1, p = 0.40212
## Skipping Durbin-Watson test: weighted regressions are not supported.
##
## Shapiro-Wilk normality test
##
## data: residuals(model)
## W = 0.87148, p-value = 1.866e-10
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 1.1714, df = 11, p-value = 0.9999
##
## GVIF Df GVIF^(1/(2*Df))
## Length1 2798.60652 1 52.901857
## Length2 4557.74577 1 67.511079
## Length3 2502.78976 1 50.027890
## Height 89.50428 1 9.460670
## Width 49.94579 1 7.067233
## factor(Species) 2370.09050 6 1.910867
## rstudent unadjusted p-value Bonferroni p
## 73 5.920809 2.1953e-08 3.4905e-06
## 143 3.915135 1.3821e-04 2.1975e-02
## Potentially influential observations of
## lm(formula = Weight ~ Length1 + Length2 + Length3 + Height + Width + factor(Species), data = fish_df, weights = weights) :
##
## dfb.1_ dfb.Lng1 dfb.Lng2 dfb.Lng3 dfb.Hght dfb.Wdth dfb.fctr(Spcs)Prk
## 36 0.21 -0.06 0.10 -0.10 -0.05 0.07 -0.16
## 56 0.02 0.04 -0.03 -0.01 -0.02 0.04 -0.02
## 57 0.02 0.05 -0.02 -0.03 0.01 0.05 -0.02
## 58 0.00 0.02 -0.01 -0.01 0.01 0.03 0.00
## 59 0.01 0.00 0.00 0.00 0.00 0.02 0.00
## 73 1.19_* 0.60 -0.11 -0.66 0.19 0.21 -0.99
## 130 0.07 0.01 0.09 -0.12 0.01 -0.05 -0.09
## 131 0.12 0.00 0.17 -0.22 0.03 -0.12 -0.15
## 143 -0.17 -0.29 0.18 0.28 -0.59 -0.63 0.15
## 144 -0.15 -0.26 0.16 0.25 -0.52 -0.56 0.13
## 145 -0.21 -0.38 0.09 0.42 -0.62 -0.06 0.24
## 157 -0.06 0.09 -0.17 0.14 -0.02 0.06 0.08
## dfb.fctr(Spcs)Prc dfb.fctr(Spcs)Pk dfb.f(S)R dfb.f(S)S dfb.f(S)W dffit
## 36 -0.14 -0.04 0.00 -0.16 -0.11 0.84
## 56 -0.02 -0.01 -0.02 -0.02 -0.15 -0.30
## 57 -0.02 0.01 -0.01 -0.01 -0.17 -0.36
## 58 0.00 0.01 0.00 0.01 -0.19 -0.43
## 59 -0.01 0.00 -0.01 0.00 0.01 0.04
## 73 -0.48 0.06 -0.66 -0.89 -0.38 2.66_*
## 130 -0.09 -0.11 -0.06 -0.08 -0.07 -0.19
## 131 -0.15 -0.19 -0.10 -0.13 -0.11 -0.36
## 143 -0.09 -0.62 -0.14 -0.11 -0.13 1.66_*
## 144 -0.08 -0.55 -0.12 -0.10 -0.12 1.47_*
## 145 -0.02 -0.36 -0.09 0.01 -0.11 1.15_*
## 157 0.08 0.07 0.05 0.04 0.06 -0.27
## cov.r cook.d hat
## 36 0.70_* 0.06 0.10
## 56 1.35_* 0.01 0.21
## 57 1.33_* 0.01 0.21
## 58 1.25_* 0.02 0.19
## 59 1.36_* 0.00 0.20
## 73 0.10_* 0.48 0.17
## 130 1.36_* 0.00 0.21
## 131 1.29_* 0.01 0.20
## 143 0.39_* 0.21 0.15
## 144 0.50_* 0.17 0.15
## 145 0.65_* 0.11 0.14
## 157 1.26_* 0.01 0.16
## dfb.1_ dfb.Lng1 dfb.Lng2 dfb.Lng3 dfb.Hght
## 36 0.206052805 -0.056811547 0.1043450978 -0.099558874 -0.050484435
## 56 0.018968422 0.039542021 -0.0274747530 -0.008358706 -0.018873950
## 57 0.020101876 0.045379369 -0.0171413541 -0.034781468 0.014603095
## 58 -0.002116319 0.019007351 -0.0135381706 -0.006686314 0.008369706
## 59 0.005986070 0.001778305 -0.0006142875 -0.003549798 -0.003183238
## 73 1.186158483 0.599659977 -0.1097381731 -0.659959083 0.193781619
## 130 0.073830069 0.010453289 0.0870022745 -0.124905713 0.008653279
## 131 0.120651566 0.001938487 0.1693636484 -0.222417602 0.026855291
## 143 -0.170514830 -0.294142350 0.1769741837 0.284100717 -0.591004853
## 144 -0.150851954 -0.260223397 0.1565664494 0.251339713 -0.522853275
## 145 -0.207167706 -0.379778506 0.0869031996 0.420939330 -0.617639295
## 157 -0.061733866 0.086927432 -0.1738300220 0.138269569 -0.018732609
## dfb.Wdth dfb.fctr(Spcs)Prk dfb.fctr(Spcs)Prc dfb.fctr(Spcs)Pk
## 36 0.06793502 -0.1640531347 -1.387828e-01 -0.03943022
## 56 0.04497524 -0.0163987627 -2.217259e-02 -0.01438838
## 57 0.05013782 -0.0227499346 -2.018855e-02 0.00904348
## 58 0.02896943 0.0002550724 -2.325441e-05 0.01274061
## 59 0.01965630 -0.0045292588 -5.804950e-03 0.00266487
## 73 0.21116628 -0.9888008438 -4.754802e-01 0.05611569
## 130 -0.05296619 -0.0877934237 -8.735768e-02 -0.10648186
## 131 -0.11700754 -0.1484982666 -1.455975e-01 -0.18657903
## 143 -0.62905977 0.1459852878 -8.747702e-02 -0.61669433
## 144 -0.55651990 0.1291510304 -7.738963e-02 -0.54558037
## 145 -0.05977564 0.2356574591 -2.017776e-02 -0.36007292
## 157 0.06372746 0.0788105324 8.342151e-02 0.07115814
## dfb.f(S)R dfb.f(S)S dfb.f(S)W dffit cov.r cook.d
## 36 0.003456882 -0.163248558 -0.107946546 0.8384556 0.70102278 0.0564031993
## 56 -0.024343166 -0.020021726 -0.149544715 -0.2959972 1.34561131 0.0073350729
## 57 -0.014378770 -0.012574706 -0.171802352 -0.3618836 1.32555490 0.0109517752
## 58 0.002005395 0.005284479 -0.190461954 -0.4323664 1.25395153 0.0155996057
## 59 -0.005478972 -0.003559482 0.009685473 0.0416554 1.36357145 0.0001455814
## 73 -0.662292491 -0.888558402 -0.383354240 2.6646150 0.09866255 0.4803880044
## 130 -0.061981921 -0.078015105 -0.066239224 -0.1942369 1.36228149 0.0031625020
## 131 -0.097866848 -0.128345879 -0.107659848 -0.3613955 1.29324399 0.0109185461
## 143 -0.136521114 -0.113664097 -0.133979231 1.6618256 0.38656774 0.2096990584
## 144 -0.120778215 -0.100556950 -0.118529449 1.4701926 0.49655472 0.1675852695
## 145 -0.090979857 0.010333574 -0.114302323 1.1543806 0.64613216 0.1057662985
## 157 0.045744119 0.042282900 0.062665472 -0.2662829 1.25554002 0.0059343685
## hat
## 36 0.09517273
## 56 0.21439073
## 57 0.21306457
## 58 0.18939258
## 59 0.20447308
## 73 0.16842554
## 130 0.21238311
## 131 0.19668353
## 143 0.15266292
## 144 0.15266292
## 145 0.13772783
## 157 0.16131194
# Adding a constant to 'Weight' to make all values positive if necessary
if(any(fish_df$Weight <= 0)) {
min_positive_value <- abs(min(fish_df$Weight[fish_df$Weight > 0]))
adjustment_factor <- min_positive_value + 0.1 # Ensure all values are strictly positive
fish_df$Adjusted_Weight <- fish_df$Weight + adjustment_factor
} else {
fish_df$Adjusted_Weight <- fish_df$Weight
}
b<-boxcox(lm(Adjusted_Weight ~ Length1 + Length2 + Length3 + Height + Width + factor(Species), data = fish_df))
lambda <- b$x[which.max(b$y)]
lambda_optimal <- 0.5
# Transforming the 'Weight' using the optimal lambda
fish_df$Transformed_Weight <- (fish_df$Adjusted_Weight^lambda_optimal - 1) / lambda_optimal
# Fitting the model with the transformed 'Weight'
lm_transformed <- lm(Transformed_Weight ~ Length1 + Length2 + Length3 + Height + Width + factor(Species), data = fish_df)
Adequacy Check
check_model_adequacy(lm_transformed)
##
## Call:
## lm(formula = Transformed_Weight ~ Length1 + Length2 + Length3 +
## Height + Width + factor(Species), data = fish_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.7061 -1.2644 -0.0962 1.2055 7.9910
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -23.2723 3.4487 -6.748 3.22e-10 ***
## Length1 0.2662 0.9845 0.270 0.78728
## Length2 -0.4179 1.2407 -0.337 0.73671
## Length3 1.3635 0.7951 1.715 0.08848 .
## Height 0.8889 0.3543 2.509 0.01320 *
## Width 1.9327 0.6489 2.978 0.00339 **
## factor(Species)Parkki 4.1837 2.0543 2.037 0.04349 *
## factor(Species)Perch 5.0973 3.2650 1.561 0.12063
## factor(Species)Pike -2.9773 3.6719 -0.811 0.41877
## factor(Species)Roach 2.0657 2.4779 0.834 0.40582
## factor(Species)Smelt 9.1688 3.2410 2.829 0.00532 **
## factor(Species)Whitefish 5.2000 2.6231 1.982 0.04930 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.546 on 147 degrees of freedom
## Multiple R-squared: 0.9815, Adjusted R-squared: 0.9801
## F-statistic: 709.6 on 11 and 147 DF, p-value: < 2.2e-16
## Analysis of Variance Table
##
## Response: Transformed_Weight
## Df Sum Sq Mean Sq F value Pr(>F)
## Length1 1 45398 45398 7002.208 < 2.2e-16 ***
## Length2 1 1558 1558 240.365 < 2.2e-16 ***
## Length3 1 524 524 80.746 1.155e-15 ***
## Height 1 2469 2469 380.769 < 2.2e-16 ***
## Width 1 267 267 41.112 1.846e-09 ***
## factor(Species) 6 389 65 10.004 2.967e-09 ***
## Residuals 147 953 6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.3102006, Df = 1, p = 0.57756
##
## Durbin-Watson test
##
## data: model
## DW = 1.7426, p-value = 0.009606
## alternative hypothesis: true autocorrelation is greater than 0
##
##
## Shapiro-Wilk normality test
##
## data: residuals(model)
## W = 0.86611, p-value = 1.023e-10
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 7.12, df = 11, p-value = 0.7893
##
## GVIF Df GVIF^(1/(2*Df))
## Length1 2360.42508 1 48.584206
## Length2 4307.91811 1 65.634732
## Length3 2076.93715 1 45.573426
## Height 56.20370 1 7.496913
## Width 29.16651 1 5.400602
## factor(Species) 1509.77571 6 1.840388
## rstudent unadjusted p-value Bonferroni p
## 41 -7.45782 7.2149e-12 1.1472e-09
## 14 -3.80379 2.0891e-04 3.3217e-02
## Potentially influential observations of
## lm(formula = Transformed_Weight ~ Length1 + Length2 + Length3 + Height + Width + factor(Species), data = fish_df) :
##
## dfb.1_ dfb.Lng1 dfb.Lng2 dfb.Lng3 dfb.Hght dfb.Wdth dfb.fctr(Spcs)Prk
## 14 -0.23 -0.31 0.35 -0.21 0.39 -0.09 0.21
## 41 -0.42 -0.12 -0.38 0.74 -0.41 0.11 0.48
## 56 0.01 0.03 -0.02 -0.01 -0.01 0.03 -0.01
## 57 0.02 0.04 -0.02 -0.03 0.01 0.04 -0.02
## 59 -0.02 0.00 0.00 0.01 0.01 -0.05 0.01
## 73 0.41 0.17 -0.02 -0.20 0.05 0.01 -0.33
## 130 -0.04 0.00 -0.04 0.06 0.00 0.02 0.04
## 131 0.14 -0.03 0.19 -0.23 0.01 -0.09 -0.16
## 143 -0.19 -0.25 0.14 0.22 -0.45 -0.56 0.15
## 144 -0.15 -0.21 0.12 0.18 -0.36 -0.46 0.12
## 145 0.02 0.03 -0.01 -0.03 0.04 0.00 -0.02
## dfb.fctr(Spcs)Prc dfb.fctr(Spcs)Pk dfb.f(S)R dfb.f(S)S dfb.f(S)W dffit
## 14 0.20 0.37 0.35 0.31 0.24 -0.87_*
## 41 0.35 0.04 -0.20 0.31 0.23 -1.91_*
## 56 -0.01 -0.01 -0.02 -0.01 -0.09 -0.18
## 57 -0.02 0.00 -0.02 -0.01 -0.13 -0.30
## 59 0.02 0.00 0.02 0.01 -0.02 -0.10
## 73 -0.13 0.00 -0.23 -0.31 -0.13 0.98_*
## 130 0.05 0.05 0.03 0.04 0.04 0.09
## 131 -0.17 -0.20 -0.12 -0.14 -0.13 -0.35
## 143 -0.03 -0.48 -0.05 -0.03 -0.06 1.53_*
## 144 -0.02 -0.39 -0.04 -0.02 -0.05 1.23_*
## 145 0.00 0.02 0.00 -0.01 0.00 -0.08
## cov.r cook.d hat
## 14 0.37_* 0.06 0.05
## 41 0.02_* 0.22 0.06
## 56 1.31_* 0.00 0.18
## 57 1.29_* 0.01 0.18
## 59 1.40_* 0.00 0.23
## 73 0.48_* 0.07 0.08
## 130 1.40_* 0.00 0.23_*
## 131 1.34_* 0.01 0.22
## 143 0.63_* 0.18 0.19
## 144 0.81 0.12 0.19
## 145 1.38_* 0.00 0.22
## dfb.1_ dfb.Lng1 dfb.Lng2 dfb.Lng3 dfb.Hght
## 14 -0.22754600 -0.309997587 0.351856239 -0.211870492 0.386799783
## 41 -0.42376205 -0.121130551 -0.381482275 0.739881558 -0.407901318
## 56 0.01181730 0.030467835 -0.020299268 -0.005118768 -0.012219381
## 57 0.01923502 0.044137741 -0.017018209 -0.029137807 0.009278129
## 59 -0.01603207 0.001464253 -0.004099291 0.009228234 0.012197495
## 73 0.40666693 0.171555968 -0.019271720 -0.198237358 0.051174178
## 130 -0.03736911 0.003443138 -0.044665317 0.058739587 0.001001843
## 131 0.13691200 -0.029672976 0.187927557 -0.230105413 0.005814215
## 143 -0.18952056 -0.254082801 0.144110379 0.219574852 -0.445997777
## 144 -0.15329102 -0.205511277 0.116561640 0.177600011 -0.360738988
## 145 0.01791805 0.028828798 -0.006307155 -0.029074097 0.041725683
## dfb.Wdth dfb.fctr(Spcs)Prk dfb.fctr(Spcs)Prc dfb.fctr(Spcs)Pk
## 14 -0.0915951668 0.20507997 0.198853485 0.366564678
## 41 0.1067347408 0.48110370 0.349943340 0.042262658
## 56 0.0300584119 -0.01039781 -0.014829562 -0.010061200
## 57 0.0438683277 -0.02125107 -0.021912879 0.001842146
## 59 -0.0474304148 0.01213841 0.016894052 -0.002418122
## 73 0.0116029412 -0.32778026 -0.134564447 0.003203448
## 130 0.0181993330 0.04242412 0.045122306 0.052876566
## 131 -0.0908897293 -0.15934163 -0.168304032 -0.204412205
## 143 -0.5627764062 0.15280736 -0.025473625 -0.480381325
## 144 -0.4551937286 0.12359607 -0.020603981 -0.388549634
## 145 -0.0001862384 -0.01915148 -0.001835594 0.020327702
## dfb.f(S)R dfb.f(S)S dfb.f(S)W dffit cov.r cook.d
## 14 0.350342387 0.305191964 0.242363778 -0.87309661 0.36763669 0.0581929058
## 41 -0.197708700 0.309702684 0.228503681 -1.91280793 0.02404903 0.2223039518
## 56 -0.016047429 -0.012866769 -0.085408187 -0.18495039 1.30868729 0.0028670413
## 57 -0.016446116 -0.014560023 -0.131118587 -0.29534424 1.28518200 0.0072992311
## 59 0.015839891 0.011027482 -0.016450812 -0.10015386 1.39728958 0.0008414263
## 73 -0.229962342 -0.312907011 -0.126500466 0.97711423 0.47602589 0.0742919358
## 130 0.032868038 0.038841896 0.036476268 0.08642296 1.40429487 0.0006265656
## 131 -0.117862182 -0.142951218 -0.134054120 -0.35136867 1.34019696 0.0103276345
## 143 -0.052670058 -0.030645631 -0.056993993 1.52661296 0.62555649 0.1834275580
## 144 -0.042601431 -0.024787285 -0.046098784 1.23477928 0.81119970 0.1226282314
## 145 0.002130587 -0.005909809 0.004423685 -0.08293957 1.38194953 0.0005770755
## hat
## 14 0.05004871
## 41 0.06172339
## 56 0.18111993
## 57 0.18219171
## 59 0.22543609
## 73 0.07709637
## 130 0.22871224
## 131 0.21890489
## 143 0.19464247
## 144 0.19464247
## 145 0.21622470
# Defining a list of models
models <- list(
lm_model = lm_model,#linear model
lm_model_log = lm_model_log, #log transformed model
wls_model = wls_model, #wls model
lm_transformed = lm_transformed #box cox transformed model
)
# Setup for comparison
model_comparisons <- data.frame(
Model = character(),
RSS = numeric(),
R2 = numeric(),
AdjR2 = numeric(),
Cp = numeric(),
BIC = numeric(),
stringsAsFactors = FALSE
)
mse_full <- sum(resid(lm_model)^2) / lm_model$df.residual
for (model_name in names(models)) {
model <- models[[model_name]]
model_summary <- summary(model)
rss <- sum(resid(model)^2)
r2 <- model_summary$r.squared
adj_r2 <- model_summary$adj.r.squared
n <- nrow(fish_df) # Number of observations
p <- length(coef(model)) # Number of predictors in the model
cp <- (rss / mse_full) - (n - 2 * p) # Calculating Cp
bic <- AIC(model, k = log(n)) # Using AIC function to compute BIC approximation
model_comparisons <- rbind(model_comparisons, data.frame(
Model = model_name,
RSS = rss,
R2 = r2,
AdjR2 = adj_r2,
Cp = cp,
BIC = bic
))
}
# Print the model comparisons
print(model_comparisons)
## Model RSS R2 AdjR2 Cp BIC
## 1 lm_model 1.294118e+06 0.9360849 0.9313021 12.00000 1948.8235
## 2 lm_model_log 2.748158e+01 0.9069636 0.9000017 -134.99688 238.0115
## 3 wls_model 1.596616e+06 0.9224076 0.9166013 46.36094 1952.6118
## 4 lm_transformed 9.530486e+02 0.9815146 0.9801313 -134.89174 801.8493
# Optionally, identify best model by criteria
best_by_adj_r2 <- model_comparisons[which.max(model_comparisons$AdjR2),]
print(paste("Best model by Adjusted R2:", best_by_adj_r2$Model))
## [1] "Best model by Adjusted R2: lm_transformed"
# Perform subset selection using regsubsets
subset_selection <- leaps::regsubsets(Transformed_Weight ~ Length1 + Length2 + Length3 + Height + Width + factor(Species),
data = fish_df,
nbest = 1,# To keep the best model of each size
really.big = TRUE,
method = "forward")
# Summarize the results
subset_summary <- summary(subset_selection)
Below plots were generated to visually assess the model’s performance based on Mallows’ Cp, BIC, and Adjusted R-squared criteria, aiding in the selection of the best subset of predictors.
plot(subset_selection,scale="Cp")
plot(subset_selection,scale="bic")
plot(subset_selection,scale="adjr2")
#### Evaluating Model Selection Criteria
RSS = subset_summary$rss
r2 = subset_summary$rsq
Cp = subset_summary$cp
BIC = subset_summary$bic
Adj_r2 = subset_summary$adjr2
cbind(RSS, r2, Cp, BIC, Adj_r2)
## RSS r2 Cp BIC Adj_r2
## [1,] 4556.0198 0.9116311 547.728996 -375.6337 0.9110683
## [2,] 1512.2457 0.9706684 80.251593 -545.9185 0.9702923
## [3,] 1305.3906 0.9746806 50.345877 -564.2375 0.9741905
## [4,] 1126.7396 0.9781457 24.790423 -582.5692 0.9775780
## [5,] 1044.5853 0.9797392 14.118792 -589.5379 0.9790770
## [6,] 1000.8034 0.9805884 9.365790 -591.2769 0.9798221
## [7,] 990.7434 0.9807835 9.814102 -587.8144 0.9798926
## [8,] 980.6719 0.9809788 10.260667 -584.3700 0.9799644
plot(Cp, xlab = "Number of Predictors", ylab = "Cp", type = 'l', lwd = 2)
points(which.min(Cp), Cp[which.min(Cp)], col = "red", cex = 2, pch = 8, lwd = 2)
plot(BIC, xlab = "Number of Predictors", ylab = "BIC", type = 'l', lwd = 2)
points(which.min(BIC), BIC[which.min(BIC)], col = "red", cex = 2, pch = 8, lwd = 2)
plot(Adj_r2, xlab = "Number of Predictors", ylab = "Adjusted R-squared", type = 'l', lwd = 2)
points(which.max(Adj_r2), Adj_r2[which.max(Adj_r2)], col = "red", cex = 2, pch = 8, lwd = 2)
#### Optimal coefficients for each criterion
coef(subset_selection, which.min(Cp))
## (Intercept) Length3 Width
## -18.700956 1.320857 2.799469
## factor(Species)Parkki factor(Species)Pike factor(Species)Roach
## 2.241776 -10.724132 -2.473789
## factor(Species)Smelt
## 3.970063
coef(subset_selection, which.min(BIC))
## (Intercept) Length3 Width
## -18.700956 1.320857 2.799469
## factor(Species)Parkki factor(Species)Pike factor(Species)Roach
## 2.241776 -10.724132 -2.473789
## factor(Species)Smelt
## 3.970063
coef(subset_selection, which.max(Adj_r2))
## (Intercept) Length2 Length3
## -18.3413926 0.6423043 0.6073081
## Height Width factor(Species)Parkki
## 0.4622490 2.6076994 1.5514236
## factor(Species)Pike factor(Species)Roach factor(Species)Smelt
## -8.1250754 -1.7165739 4.4906379
# Extracting the best set of predictors based on BIC
best_model_vars_bic <- names(coef(subset_selection, which.min(subset_summary$bic)))
best_model_vars_bic
## [1] "(Intercept)" "Length3" "Width"
## [4] "factor(Species)Parkki" "factor(Species)Pike" "factor(Species)Roach"
## [7] "factor(Species)Smelt"
# Extracting the best set of predictors based on cp
best_model_vars_cp <- names(coef(subset_selection, which.min(subset_summary$cp)))
best_model_vars_cp
## [1] "(Intercept)" "Length3" "Width"
## [4] "factor(Species)Parkki" "factor(Species)Pike" "factor(Species)Roach"
## [7] "factor(Species)Smelt"
# Extracting the best set of predictors based on adjr2
best_model_vars_adjr2 <- names(coef(subset_selection, which.max(subset_summary$adjr2)))
best_model_vars_adjr2
## [1] "(Intercept)" "Length2" "Length3"
## [4] "Height" "Width" "factor(Species)Parkki"
## [7] "factor(Species)Pike" "factor(Species)Roach" "factor(Species)Smelt"
best_subset_model_bic <- lm(Transformed_Weight ~ Length3 + Width + Species,
data = fish_df,
subset = Species %in% c("Parkki", "Pike", "Roach", "Smelt"))
# Check the summary of the model
check_model_adequacy(best_subset_model_bic)
##
## Call:
## lm(formula = Transformed_Weight ~ Length3 + Width + Species,
## data = fish_df, subset = Species %in% c("Parkki", "Pike",
## "Roach", "Smelt"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.5443 -0.8635 0.1469 1.2890 4.2608
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -15.19148 1.72648 -8.799 3.81e-12 ***
## Length3 1.57677 0.16242 9.708 1.32e-13 ***
## Width 0.59493 1.27061 0.468 0.641440
## SpeciesPike -15.48805 2.43738 -6.354 4.01e-08 ***
## SpeciesRoach -4.30959 1.07258 -4.018 0.000177 ***
## SpeciesSmelt 0.07884 1.55800 0.051 0.959823
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.771 on 56 degrees of freedom
## Multiple R-squared: 0.9798, Adjusted R-squared: 0.978
## F-statistic: 542.9 on 5 and 56 DF, p-value: < 2.2e-16
## Analysis of Variance Table
##
## Response: Transformed_Weight
## Df Sum Sq Mean Sq F value Pr(>F)
## Length3 1 20269.3 20269.3 2639.182 < 2.2e-16 ***
## Width 1 153.3 153.3 19.957 3.895e-05 ***
## Species 3 427.0 142.3 18.531 1.791e-08 ***
## Residuals 56 430.1 7.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.02888963, Df = 1, p = 0.86503
##
## Durbin-Watson test
##
## data: model
## DW = 2.0288, p-value = 0.3402
## alternative hypothesis: true autocorrelation is greater than 0
##
##
## Shapiro-Wilk normality test
##
## data: residuals(model)
## W = 0.73116, p-value = 2.479e-09
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 2.3849, df = 5, p-value = 0.7937
##
## GVIF Df GVIF^(1/(2*Df))
## Length3 44.82414 1 6.695083
## Width 30.34475 1 5.508607
## Species 25.54705 3 1.716156
## rstudent unadjusted p-value Bonferroni p
## 41 -8.952754 2.5149e-12 1.5593e-10
## Potentially influential observations of
## lm(formula = Transformed_Weight ~ Length3 + Width + Species, data = fish_df, subset = Species %in% c("Parkki", "Pike", "Roach", "Smelt")) :
##
## dfb.1_ dfb.Lng3 dfb.Wdth dfb.SpcP dfb.SpcR dfb.SpcS dffit cov.r cook.d
## 41 -0.42 -0.03 0.20 -0.14 -1.28_* 0.28 -2.11_* 0.01_* 0.31
## 142 -0.16 -0.09 0.15 0.05 -0.05 0.14 0.24 1.34_* 0.01
## 143 -0.29 0.98 -0.77 -0.80 0.07 -0.18 1.19_* 1.09 0.22
## 145 0.26 -0.08 -0.04 0.13 0.05 -0.14 -0.36 1.39_* 0.02
## hat
## 41 0.05
## 142 0.19
## 143 0.29
## 145 0.23
## dfb.1_ dfb.Lng3 dfb.Wdth dfb.SpcP dfb.SpcR dfb.SpcS
## 41 -0.4171618 -0.03093509 0.20397593 -0.14493410 -1.28373477 0.2813857
## 142 -0.1627784 -0.09302800 0.15282140 0.04966278 -0.04843667 0.1397822
## 143 -0.2857662 0.98235123 -0.76801878 -0.80252811 0.07354258 -0.1789287
## 145 0.2603089 -0.08251684 -0.03518008 0.12650917 0.04544527 -0.1378732
## dffit cov.r cook.d hat
## 41 -2.1113349 0.005341991 0.30784292 0.05268596
## 142 0.2442338 1.336152489 0.01007557 0.18903818
## 143 1.1863550 1.085055968 0.22466661 0.28860512
## 145 -0.3637708 1.388234000 0.02228065 0.23426752
best_subset_model_adjr2 <- lm(Transformed_Weight ~ Length3 + Length2 + Height + Width,
data = fish_df,
subset = Species %in% c("Parkki", "Pike", "Roach", "Smelt"))
# Check the summary of the model
check_model_adequacy(best_subset_model_adjr2)
##
## Call:
## lm(formula = Transformed_Weight ~ Length3 + Length2 + Height +
## Width, data = fish_df, subset = Species %in% c("Parkki",
## "Pike", "Roach", "Smelt"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.0108 -0.9036 0.6444 1.4476 8.9909
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11.9869 1.4045 -8.535 8.95e-12 ***
## Length3 0.5554 1.4688 0.378 0.70675
## Length2 0.5662 1.4524 0.390 0.69810
## Height 1.0364 0.3663 2.830 0.00643 **
## Width 0.2732 1.5208 0.180 0.85808
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.631 on 57 degrees of freedom
## Multiple R-squared: 0.9647, Adjusted R-squared: 0.9622
## F-statistic: 389.3 on 4 and 57 DF, p-value: < 2.2e-16
## Analysis of Variance Table
##
## Response: Transformed_Weight
## Df Sum Sq Mean Sq F value Pr(>F)
## Length3 1 20269.3 20269.3 1537.7339 < 2.2e-16 ***
## Length2 1 95.8 95.8 7.2655 0.009222 **
## Height 1 162.8 162.8 12.3512 0.000871 ***
## Width 1 0.4 0.4 0.0323 0.858083
## Residuals 57 751.3 13.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 9.747459, Df = 1, p = 0.0017957
##
## Durbin-Watson test
##
## data: model
## DW = 1.1906, p-value = 8.967e-05
## alternative hypothesis: true autocorrelation is greater than 0
##
##
## Shapiro-Wilk normality test
##
## data: residuals(model)
## W = 0.83149, p-value = 6.499e-07
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 2.4087, df = 4, p-value = 0.6611
##
## Length3 Length2 Height Width
## 2135.98235 1884.20734 4.59343 25.32757
## rstudent unadjusted p-value Bonferroni p
## 41 -6.093245 1.0715e-07 6.6435e-06
## Potentially influential observations of
## lm(formula = Transformed_Weight ~ Length3 + Length2 + Height + Width, data = fish_df, subset = Species %in% c("Parkki", "Pike", "Roach", "Smelt")) :
##
## dfb.1_ dfb.Lng3 dfb.Lng2 dfb.Hght dfb.Wdth dffit cov.r cook.d hat
## 41 -0.46 0.40 -0.35 0.21 -0.70 -1.14_* 0.09_* 0.16 0.03
## 130 0.15 -0.35 0.36 0.16 0.14 -0.39 1.35_* 0.03 0.23
## 143 -0.52 0.35 -0.29 0.27 -0.66 1.27_* 0.66_* 0.29 0.16
## 144 -0.44 0.30 -0.25 0.23 -0.56 1.07_* 0.79 0.21 0.16
## 158 0.00 0.00 0.00 0.00 0.00 0.01 1.27_* 0.00 0.14
## dfb.1_ dfb.Lng3 dfb.Lng2 dfb.Hght dfb.Wdth
## 41 -0.462827733 0.404165726 -0.354558300 0.210859814 -0.704806678
## 130 0.149023648 -0.354043037 0.355011501 0.162439298 0.142594827
## 143 -0.517495890 0.354126625 -0.292054438 0.269701309 -0.663340589
## 144 -0.436810677 0.298913081 -0.246518860 0.227650912 -0.559916045
## 158 0.004516853 -0.004303681 0.004209706 -0.001505875 0.003587604
## dffit cov.r cook.d hat
## 41 -1.142237994 0.08891625 1.597127e-01 0.03394817
## 130 -0.391244056 1.34688132 3.087175e-02 0.22580714
## 143 1.269374354 0.65609075 2.858429e-01 0.16319211
## 144 1.071460241 0.79206102 2.114750e-01 0.16319211
## 158 0.005399422 1.27338555 5.934852e-06 0.14203929
Prediction Error Sum of Squares (PRESS) Calculation
library(DAAG)
##
## Attaching package: 'DAAG'
## The following object is masked from 'package:MASS':
##
## hills
## The following object is masked from 'package:car':
##
## vif
press(best_subset_model_bic)
## [1] 518.6548
press(best_subset_model_adjr2)
## [1] 895.466
library(caret)
## Loading required package: lattice
# Setting up cross-validation
set.seed(123) # for reproducibility
train_control <- trainControl(method = "cv", number = 10) # 10-fold CV
# Training the model using cross-validation
cv_model <- train(Transformed_Weight ~ Length3 + Width + Species,
data = fish_df,
method = "lm",
trControl = train_control)
# Summary of results
print(cv_model)
## Linear Regression
##
## 159 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 143, 143, 143, 143, 143, 143, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 2.49285 0.9807939 1.758636
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Splitting the data into training and testing sets
set.seed(123) # For reproducibility
train_index <- createDataPartition(fish_df$Transformed_Weight, p = 0.8, list = FALSE)
train_data <- fish_df[train_index, ]
test_data <- fish_df[-train_index, ]
# Fitting the model on training data
model_train <- lm(Transformed_Weight ~ Length3 + Width + Species, data = train_data)
# Predicting on testing data
predictions <- predict(model_train, test_data)
# Calculating RMSE on test data
test_rmse <- sqrt(mean((predictions - test_data$Transformed_Weight)^2))
print(paste("Test RMSE:", test_rmse))
## [1] "Test RMSE: 3.63144833568236"
# Calculating R-squared on test data
test_r2 <- summary(lm(predictions ~ test_data$Transformed_Weight))$r.squared
print(paste("Test R-squared:", test_r2))
## [1] "Test R-squared: 0.952502238335418"
Rathod, V. L. (n.d.). Fish Market. Kaggle. Retrieved 2024, from https://www.kaggle.com/datasets/vipullrathod/fish-market