Loading the required packages (the code shown below relies on functions from tidyr, ggplot2, car, and lmtest):

Introduction

The Fish Market dataset offers a valuable resource for examining the relationship between the physical attributes of various fish species and their corresponding weights. Accurate prediction of fish weight based on non-invasive measurements is critical for the fisheries industry, affecting everything from stock management to ecological studies and commercial pricing strategies. However, capturing the complexities of biological variations across different species using standard linear models can be challenging due to the nonlinear nature of biological data.

This study seeks to address these challenges by fitting and comparing several regression approaches (an ordinary linear model, a Box-Cox-transformed model, a logarithmically transformed model, and a weighted least squares model) in order to identify the best predictive model. By doing so, it aims not only to enhance predictive accuracy but also to provide insights into the factors that most significantly influence fish weight. These insights are crucial for stakeholders in fisheries science, providing a basis for more informed decision-making and contributing to the broader field of aquatic life studies.

Dataset sourced from - Fish market. (n.d.). Retrieved June 13, 2024, from https://www.kaggle.com/datasets/vipullrathod/fish-market

Exploratory Data Analysis (EDA):

# Read the fish measurements from the CSV file and preview the first six rows
fish_df <- read.csv(file = "Fish.csv", header = TRUE)
head(fish_df, n = 6L)
##   Species Weight Length1 Length2 Length3  Height  Width
## 1   Bream    242    23.2    25.4    30.0 11.5200 4.0200
## 2   Bream    290    24.0    26.3    31.2 12.4800 4.3056
## 3   Bream    340    23.9    26.5    31.1 12.3778 4.6961
## 4   Bream    363    26.3    29.0    33.5 12.7300 4.4555
## 5   Bream    430    26.5    29.0    34.0 12.4440 5.1340
## 6   Bream    450    26.8    29.7    34.7 13.6024 4.9274
# Summary statistics for every column of the data set
summary(fish_df)
##    Species              Weight          Length1         Length2     
##  Length:159         Min.   :   0.0   Min.   : 7.50   Min.   : 8.40  
##  Class :character   1st Qu.: 120.0   1st Qu.:19.05   1st Qu.:21.00  
##  Mode  :character   Median : 273.0   Median :25.20   Median :27.30  
##                     Mean   : 398.3   Mean   :26.25   Mean   :28.42  
##                     3rd Qu.: 650.0   3rd Qu.:32.70   3rd Qu.:35.50  
##                     Max.   :1650.0   Max.   :59.00   Max.   :63.40  
##     Length3          Height           Width      
##  Min.   : 8.80   Min.   : 1.728   Min.   :1.048  
##  1st Qu.:23.15   1st Qu.: 5.945   1st Qu.:3.386  
##  Median :29.40   Median : 7.786   Median :4.248  
##  Mean   :31.23   Mean   : 8.971   Mean   :4.417  
##  3rd Qu.:39.65   3rd Qu.:12.366   3rd Qu.:5.585  
##  Max.   :68.00   Max.   :18.957   Max.   :8.142
# Compact display of the data frame's structure: column names, types, and leading values
str(fish_df)
## 'data.frame':    159 obs. of  7 variables:
##  $ Species: chr  "Bream" "Bream" "Bream" "Bream" ...
##  $ Weight : num  242 290 340 363 430 450 500 390 450 500 ...
##  $ Length1: num  23.2 24 23.9 26.3 26.5 26.8 26.8 27.6 27.6 28.5 ...
##  $ Length2: num  25.4 26.3 26.5 29 29 29.7 29.7 30 30 30.7 ...
##  $ Length3: num  30 31.2 31.1 33.5 34 34.7 34.5 35 35.1 36.2 ...
##  $ Height : num  11.5 12.5 12.4 12.7 12.4 ...
##  $ Width  : num  4.02 4.31 4.7 4.46 5.13 ...
# Count missing (NA) values across the entire data frame
sum(is.na(fish_df))
## [1] 0
# Flag "special" values (Inf, -Inf, NaN) element-wise.
#
# x: any vector.
# Returns a logical vector the same length as x. Non-numeric input cannot hold
# Inf or NaN, so it yields all FALSE instead of NULL; this keeps the return type
# stable for callers that sum or index the result.
is.special <- function(x) {
  if (!is.numeric(x)) {
    return(rep(FALSE, length(x)))
  }
  is.infinite(x) | is.nan(x)
}
# Count special values per column. vapply() pins the result to exactly one
# integer per column, whereas sapply() would silently change its return type
# on unexpected input.
vapply(fish_df, function(x) sum(is.special(x)), integer(1))
## Species  Weight Length1 Length2 Length3  Height   Width 
##       0       0       0       0       0       0       0

Data Visualization

# Reshape wide -> long so the three length measurements share a single column,
# keyed by the name of the measurement they came from
fish_long <- pivot_longer(
  data = fish_df,
  cols = Length1:Length3,
  names_to = "LengthType",
  values_to = "Value"
)

# One box-plot panel per length measurement; each species keeps the same fill
# colour across panels
ggplot(data = fish_long, mapping = aes(Species, Value, fill = Species)) +
  geom_boxplot() +
  facet_wrap(vars(LengthType), scales = "free_y") +
  scale_fill_brewer(name = "Species", palette = "Set3") +
  ggtitle("Comparison of Fish Lengths by Species") +
  xlab("Species") +
  ylab("Length (cm)") +
  theme_minimal() +
  # Slant the species labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Jittered strip plot of individual weights within each species
ggplot(data = fish_df, mapping = aes(Species, Weight, colour = Species)) +
  geom_jitter(size = 2, alpha = 0.6, width = 0.2) +
  scale_colour_brewer(palette = "Set1") +
  theme_minimal() +
  labs(y = "Weight (g)", x = "Species", title = "Jitter Plot of Weight by Species")

# Weight against Length3, one point per fish, coloured by species
ggplot(data = fish_df, mapping = aes(Length3, Weight, colour = Species)) +
  geom_point() +
  scale_colour_brewer(palette = "Set1") +
  theme_minimal() +
  labs(
    y = "Weight (g)",
    x = "Length3 (cm)",
    title = "Scatter plot of Weight vs. Length3 by Species"
  )

Multiple Regression Estimation

Defining functions for adequacy checks

# Print a battery of adequacy diagnostics for a fitted linear model.
#
# In order: model summary, the standard diagnostic plots, ANOVA table, score
# test for non-constant variance, Durbin-Watson test (unweighted fits only),
# Shapiro-Wilk test on the residuals, component+residual plots (term plots when
# the model contains interactions), studentized Breusch-Pagan test, variance
# inflation factors, Bonferroni outlier test, and a summary of influence
# measures.
#
# model: a fitted model object as returned by lm().
# Returns `model` invisibly; the function is called for its printed and
# plotted output. Requires the car and lmtest packages to be installed.
check_model_adequacy <- function(model) {
  # Summary of the model
  print(summary(model))

  # Diagnostic plots on a 2 x 2 grid. The previous graphics layout is restored
  # when the function exits, including when a later diagnostic fails.
  old_par <- par(mfrow = c(2, 2))
  on.exit(par(old_par), add = TRUE)
  plot(model)

  # ANOVA table
  print(anova(model))

  # Non-constant variance score test
  print(car::ncvTest(model))

  # Durbin-Watson test, only for unweighted fits
  if (is.null(stats::weights(model))) {
    print(lmtest::dwtest(model))
  } else {
    cat("Skipping Durbin-Watson test: weighted regressions are not supported.\n")
  }

  # Shapiro-Wilk normality test of the residuals
  print(shapiro.test(residuals(model)))

  # Component + residual plots are not defined for models with interactions.
  # Detect interactions from the term orders rather than by searching
  # coefficient names for ":", which would also match a factor level that
  # happens to contain a colon.
  has_interaction <- any(attr(stats::terms(model), "order") > 1)
  if (!has_interaction) {
    car::crPlots(model)
  } else {
    cat("Skipping C+R plots due to interaction terms. Using termplot instead.\n")
    termplot(model, partial.resid = TRUE, se = TRUE)
  }

  # Studentized Breusch-Pagan test
  print(lmtest::bptest(model))

  # (Generalized) variance inflation factors
  print(car::vif(model))

  # Bonferroni outlier test. outlierTest() never returns NULL; it reports
  # whether any observation is significant through its `signif` element.
  outliers <- car::outlierTest(model)
  if (!isTRUE(outliers$signif)) {
    cat("No significant outliers detected.\n")
  } else {
    print(outliers)
  }

  # Influence measures. summary() on an influence.measures object prints the
  # table of flagged observations itself and returns the matrix invisibly, so
  # wrapping it in print() would emit the same table a second time.
  summary(influence.measures(model))

  invisible(model)
}

Linear Model

# Ordinary least squares fit of raw weight on the five size measurements,
# with species entered as a categorical predictor
lm_model <- lm(
  formula = Weight ~ Length1 + Length2 + Length3 + Height + Width + factor(Species),
  data = fish_df
)

Adequacy Check

# Print the diagnostics and plots for the untransformed linear model
check_model_adequacy(lm_model)
## 
## Call:
## lm(formula = Weight ~ Length1 + Length2 + Length3 + Height + 
##     Width + factor(Species), data = fish_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -213.18  -53.19  -12.62   36.49  420.82 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -918.3321   127.0831  -7.226  2.5e-11 ***
## Length1                   -80.3030    36.2785  -2.214 0.028403 *  
## Length2                    79.8886    45.7180   1.747 0.082653 .  
## Length3                    32.5354    29.3002   1.110 0.268633    
## Height                      5.2510    13.0560   0.402 0.688128    
## Width                      -0.5154    23.9130  -0.022 0.982832    
## factor(Species)Parkki     164.7227    75.6995   2.176 0.031152 *  
## factor(Species)Perch      137.9489   120.3135   1.147 0.253419    
## factor(Species)Pike      -208.4294   135.3064  -1.540 0.125607    
## factor(Species)Roach      103.0400    91.3084   1.128 0.260954    
## factor(Species)Smelt      446.0733   119.4303   3.735 0.000268 ***
## factor(Species)Whitefish   93.8742    96.6580   0.971 0.333045    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 93.83 on 147 degrees of freedom
## Multiple R-squared:  0.9361, Adjusted R-squared:  0.9313 
## F-statistic: 195.7 on 11 and 147 DF,  p-value: < 2.2e-16

## Analysis of Variance Table
## 
## Response: Weight
##                  Df   Sum Sq  Mean Sq   F value    Pr(>F)    
## Length1           1 16978060 16978060 1928.5523 < 2.2e-16 ***
## Length2           1   235134   235134   26.7091 7.589e-07 ***
## Length3           1    74109    74109    8.4181  0.004287 ** 
## Height            1   619028   619028   70.3160 3.787e-14 ***
## Width             1    18474    18474    2.0985  0.149571    
## factor(Species)   6  1028534   171422   19.4720 < 2.2e-16 ***
## Residuals       147  1294118     8804                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.03271351, Df = 1, p = 0.85647
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 0.97285, p-value = 1.826e-13
## alternative hypothesis: true autocorrelation is greater than 0
## 
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.93335, p-value = 9.145e-07

## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 16.733, df = 11, p-value = 0.116
## 
##                       GVIF Df GVIF^(1/(2*Df))
## Length1         2360.42508  1       48.584206
## Length2         4307.91811  1       65.634732
## Length3         2076.93715  1       45.573426
## Height            56.20370  1        7.496913
## Width             29.16651  1        5.400602
## factor(Species) 1509.77571  6        1.840388
##     rstudent unadjusted p-value Bonferroni p
## 73  5.041405         1.3492e-06   0.00021452
## 143 3.726958         2.7645e-04   0.04395500
## Potentially influential observations of
##   lm(formula = Weight ~ Length1 + Length2 + Length3 + Height +      Width + factor(Species), data = fish_df) :
## 
##     dfb.1_ dfb.Lng1 dfb.Lng2 dfb.Lng3 dfb.Hght dfb.Wdth dfb.fctr(Spcs)Prk
## 14  -0.14  -0.19     0.22    -0.13     0.24    -0.06     0.13            
## 34   0.03  -0.05     0.10    -0.11     0.07    -0.02    -0.06            
## 36   0.16  -0.05     0.10    -0.10    -0.04     0.05    -0.13            
## 56   0.02   0.06    -0.04    -0.01    -0.02     0.06    -0.02            
## 59   0.02   0.00     0.00    -0.01    -0.01     0.05    -0.01            
## 61  -0.09   0.25    -0.20     0.04     0.09    -0.20     0.05            
## 73   0.61   0.26    -0.03    -0.30     0.08     0.02    -0.49            
## 74   0.01   0.14    -0.19     0.13     0.08    -0.20     0.03            
## 112  0.09   0.04    -0.06    -0.03    -0.14     0.70    -0.05            
## 130  0.00   0.00     0.01    -0.01     0.00     0.00     0.00            
## 131  0.09  -0.02     0.12    -0.14     0.00    -0.06    -0.10            
## 133  0.02   0.01     0.01    -0.03     0.03     0.01    -0.02            
## 143 -0.23  -0.30     0.17     0.26    -0.54    -0.68     0.18            
## 144 -0.19  -0.25     0.14     0.22    -0.44    -0.56     0.15            
## 145 -0.26  -0.41     0.09     0.41    -0.59     0.00     0.27            
##     dfb.fctr(Spcs)Prc dfb.fctr(Spcs)Pk dfb.f(S)R dfb.f(S)S dfb.f(S)W dffit  
## 14   0.12              0.23             0.22      0.19      0.15     -0.54  
## 34  -0.04              0.00            -0.01     -0.02     -0.03      0.16  
## 36  -0.12             -0.05             0.01     -0.13     -0.10      0.62  
## 56  -0.03             -0.02            -0.03     -0.02     -0.16     -0.35  
## 59  -0.02              0.00            -0.02     -0.01      0.02      0.11  
## 61   0.08             -0.02             0.07      0.05      0.41      0.89_*
## 73  -0.20              0.00            -0.34     -0.47     -0.19      1.46_*
## 74   0.16              0.10             0.06      0.01      0.14      0.62  
## 112 -0.10              0.12            -0.12     -0.03     -0.16      0.86_*
## 130 -0.01             -0.01             0.00      0.00      0.00     -0.01  
## 131 -0.11             -0.13            -0.07     -0.09     -0.08     -0.22  
## 133 -0.01              0.03             0.00     -0.01      0.00      0.08  
## 143 -0.03             -0.58            -0.06     -0.04     -0.07      1.83_*
## 144 -0.03             -0.47            -0.05     -0.03     -0.06      1.51_*
## 145  0.03             -0.29            -0.03      0.08     -0.06      1.18_*
##     cov.r   cook.d hat    
## 14   0.73_*  0.02   0.05  
## 34   1.25_*  0.00   0.14  
## 36   0.72_*  0.03   0.06  
## 56   1.27_*  0.01   0.18  
## 59   1.40_*  0.00   0.23  
## 61   1.05    0.07   0.20  
## 73   0.17_*  0.15   0.08  
## 74   0.69_*  0.03   0.06  
## 112  0.96    0.06   0.17  
## 130  1.41_*  0.00   0.23_*
## 131  1.37_*  0.00   0.22  
## 133  1.25_*  0.00   0.13  
## 143  0.45_*  0.26   0.19  
## 144  0.64_*  0.18   0.19  
## 145  0.92    0.11   0.22  
##           dfb.1_      dfb.Lng1     dfb.Lng2     dfb.Lng3      dfb.Hght
## 14  -0.141616588 -0.1929315429  0.218982889 -0.131860707  0.2407305160
## 34   0.030440560 -0.0462153032  0.104886195 -0.108042122  0.0696512072
## 36   0.163232617 -0.0488938639  0.098306132 -0.101279759 -0.0426674983
## 56   0.022290078  0.0574691689 -0.038288970 -0.009655143 -0.0230484924
## 59   0.018040929 -0.0016477278  0.004612943 -0.010384556 -0.0137258729
## 61  -0.086555887  0.2532906094 -0.201878871  0.041163663  0.0890340521
## 73   0.606434482  0.2558296423 -0.028738594 -0.295617768  0.0763125406
## 74   0.006242169  0.1414483779 -0.193851039  0.130907288  0.0819001657
## 112  0.089522514  0.0434279233 -0.060712630 -0.028098785 -0.1363368027
## 130  0.004312081 -0.0003973092  0.005154002 -0.006778055 -0.0001156042
## 131  0.086122649 -0.0186653860  0.118213300 -0.144744712  0.0036573538
## 133  0.016359600  0.0097792744  0.010388821 -0.032783338  0.0327090729
## 143 -0.227460671 -0.3049476308  0.172959832  0.263531536 -0.5352820604
## 144 -0.186973292 -0.2506677840  0.142173453  0.216623641 -0.4400033133
## 145 -0.255315561 -0.4107836183  0.089871107  0.414278901 -0.5945522588
##         dfb.Wdth dfb.fctr(Spcs)Prk dfb.fctr(Spcs)Prc dfb.fctr(Spcs)Pk
## 14  -0.057005595       0.127634525       0.123759382      0.228136901
## 34  -0.023864059      -0.055407351      -0.044044944     -0.004707184
## 36   0.049965389      -0.134595482      -0.116295411     -0.051909245
## 56   0.056696904      -0.019612601      -0.027971878     -0.018977680
## 59   0.053373569      -0.013659389      -0.019010921      0.002721119
## 61  -0.198877437       0.054213874       0.080755059     -0.022709299
## 73   0.017302670      -0.488796208      -0.200666725      0.004777082
## 74  -0.202101728       0.026201588       0.164585525      0.101960252
## 112  0.704040862      -0.051732791      -0.103979036      0.119371823
## 130 -0.002100050      -0.004895387      -0.005206735     -0.006101511
## 131 -0.057172961      -0.100231704      -0.105869386     -0.128582745
## 133  0.013880589      -0.018371572      -0.008186606      0.031689176
## 143 -0.675438601       0.183397864      -0.030573190     -0.576548851
## 144 -0.555212371       0.150753544      -0.025131245     -0.473924727
## 145  0.002653725       0.272890884       0.026155515     -0.289650884
##         dfb.f(S)R    dfb.f(S)S    dfb.f(S)W        dffit     cov.r       cook.d
## 14   0.2180407210  0.189940693  0.150838651 -0.543384478 0.7270559 2.385827e-02
## 34  -0.0111900976 -0.017900539 -0.032191666  0.156793491 1.2508137 2.060631e-03
## 36   0.0107610589 -0.134322673 -0.098549398  0.623309759 0.7193873 3.133267e-02
## 56  -0.0302690499 -0.024269611 -0.161098989 -0.348857905 1.2669205 1.017294e-02
## 59  -0.0178246711 -0.012409254  0.018512141  0.112703395 1.3962362 1.065436e-03
## 61   0.0698430795  0.047469008  0.407856443  0.894170047 1.0545803 6.567149e-02
## 73  -0.3429270599 -0.466616754 -0.188641465  1.457103401 0.1714125 1.517281e-01
## 74   0.0593840165  0.010114573  0.135172161  0.618633448 0.6891464 3.076807e-02
## 112 -0.1224779451 -0.032814716 -0.164170353  0.863355569 0.9605187 6.097730e-02
## 130 -0.0037926953 -0.004482028 -0.004209055 -0.009972483 1.4071660 8.344279e-06
## 131 -0.0741396186 -0.089921539 -0.084324939 -0.221023729 1.3697819 4.093953e-03
## 133 -0.0007786165 -0.005038435 -0.003168836  0.080498948 1.2453740 5.435461e-04
## 143 -0.0632140761 -0.036780579 -0.068403619  1.832225575 0.4528524 2.572007e-01
## 144 -0.0519621429 -0.030233736 -0.056227961  1.506094419 0.6381089 1.788258e-01
## 145 -0.0303588844  0.084209296 -0.063033399  1.181811945 0.9198758 1.132597e-01
##            hat
## 14  0.05004871
## 34  0.14273832
## 36  0.06182012
## 56  0.18111993
## 59  0.22543609
## 61  0.20285900
## 73  0.07709637
## 74  0.05666553
## 112 0.16605984
## 130 0.22871224
## 131 0.21890489
## 133 0.13154547
## 143 0.19464247
## 144 0.19464247
## 145 0.21622470

Log Transformed Model

# Smallest observed response; log() needs strictly positive input
min_value <- min(fish_df$Weight)

# Take the log directly when every weight is positive; otherwise shift all
# weights up by |min| + 1 first so the smallest value maps to log(1) = 0
if (min_value > 0) {
  fish_df[["Weight_adjusted"]] <- log(fish_df[["Weight"]])
} else {
  offset <- abs(min_value) + 1
  fish_df[["Weight_adjusted"]] <- log(fish_df[["Weight"]] + offset)
}

# Same predictors as the linear model, now with the log-scale response
lm_model_log <- lm(
  formula = Weight_adjusted ~ Length1 + Length2 + Length3 + Height + Width + factor(Species),
  data = fish_df
)

Adequacy Check

# Print the diagnostics and plots for the log-transformed model
check_model_adequacy(lm_model_log)
## 
## Call:
## lm(formula = Weight_adjusted ~ Length1 + Length2 + Length3 + 
##     Height + Width + factor(Species), data = fish_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4730 -0.0723  0.0416  0.1561  0.4420 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               2.10901    0.58563   3.601 0.000432 ***
## Length1                   0.07793    0.16718   0.466 0.641796    
## Length2                  -0.23099    0.21068  -1.096 0.274687    
## Length3                   0.20581    0.13502   1.524 0.129582    
## Height                    0.03297    0.06017   0.548 0.584559    
## Width                     0.21191    0.11020   1.923 0.056406 .  
## factor(Species)Parkki     0.38596    0.34884   1.106 0.270353    
## factor(Species)Perch      0.44599    0.55443   0.804 0.422465    
## factor(Species)Pike       0.09975    0.62352   0.160 0.873115    
## factor(Species)Roach      0.00239    0.42077   0.006 0.995476    
## factor(Species)Smelt     -0.81897    0.55036  -1.488 0.138879    
## factor(Species)Whitefish  0.46379    0.44542   1.041 0.299479    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4324 on 147 degrees of freedom
## Multiple R-squared:  0.907,  Adjusted R-squared:    0.9 
## F-statistic: 130.3 on 11 and 147 DF,  p-value: < 2.2e-16

## Analysis of Variance Table
## 
## Response: Weight_adjusted
##                  Df  Sum Sq Mean Sq   F value    Pr(>F)    
## Length1           1 218.409 218.409 1168.2779 < 2.2e-16 ***
## Length2           1  21.874  21.874  117.0029 < 2.2e-16 ***
## Length3           1   1.815   1.815    9.7096  0.002205 ** 
## Height            1  12.469  12.469   66.6989 1.323e-13 ***
## Width             1   3.491   3.491   18.6738 2.842e-05 ***
## factor(Species)   6   9.845   1.641    8.7772 3.547e-08 ***
## Residuals       147  27.482   0.187                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 31.67918, Df = 1, p = 1.8186e-08
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 1.8983, p-value = 0.08929
## alternative hypothesis: true autocorrelation is greater than 0
## 
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.43, p-value < 2.2e-16

## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 9.5386, df = 11, p-value = 0.5723
## 
##                       GVIF Df GVIF^(1/(2*Df))
## Length1         2360.42508  1       48.584206
## Length2         4307.91811  1       65.634732
## Length3         2076.93715  1       45.573426
## Height            56.20370  1        7.496913
## Width             29.16651  1        5.400602
## factor(Species) 1509.77571  6        1.840388
##     rstudent unadjusted p-value Bonferroni p
## 41 -22.48584         2.8169e-49   4.4789e-47
## Potentially influential observations of
##   lm(formula = Weight_adjusted ~ Length1 + Length2 + Length3 +      Height + Width + factor(Species), data = fish_df) :
## 
##     dfb.1_  dfb.Lng1 dfb.Lng2 dfb.Lng3 dfb.Hght dfb.Wdth dfb.fctr(Spcs)Prk
## 30   0.03    0.02    -0.02     0.00    -0.04     0.01    -0.02            
## 34  -0.03    0.05    -0.11     0.11    -0.07     0.02     0.06            
## 41  -1.28_* -0.37    -1.15_*   2.23_*  -1.23_*   0.32     1.45_*          
## 56  -0.01   -0.02     0.01     0.00     0.01    -0.02     0.01            
## 57  -0.01   -0.01     0.01     0.01     0.00    -0.01     0.01            
## 58   0.00    0.00     0.00     0.00     0.00     0.00     0.00            
## 59  -0.02    0.00     0.00     0.01     0.01    -0.05     0.01            
## 60   0.00    0.00     0.00     0.00     0.00     0.00     0.00            
## 61   0.01   -0.03     0.02     0.00    -0.01     0.02    -0.01            
## 73  -0.44   -0.18     0.02     0.21    -0.05    -0.01     0.35            
## 112  0.00    0.00     0.00     0.00     0.00     0.02     0.00            
## 130 -0.02    0.00    -0.02     0.03     0.00     0.01     0.02            
## 131  0.04   -0.01     0.06    -0.07     0.00    -0.03    -0.05            
## 143  0.02    0.03    -0.01    -0.02     0.05     0.06    -0.02            
## 144  0.02    0.03    -0.02    -0.03     0.06     0.07    -0.02            
## 145  0.21    0.33    -0.07    -0.33     0.48     0.00    -0.22            
##     dfb.fctr(Spcs)Prc dfb.fctr(Spcs)Pk dfb.f(S)R dfb.f(S)S dfb.f(S)W dffit  
## 30  -0.02             -0.04            -0.03     -0.03     -0.02     -0.06  
## 34   0.05              0.00             0.01      0.02      0.03     -0.16  
## 41   1.06_*            0.13            -0.60      0.93      0.69     -5.77_*
## 56   0.01              0.01             0.01      0.01      0.06      0.14  
## 57   0.01              0.00             0.00      0.00      0.04      0.09  
## 58   0.00              0.00             0.00      0.00      0.00      0.01  
## 59   0.02              0.00             0.02      0.01     -0.02     -0.12  
## 60   0.00              0.00             0.00      0.00     -0.01     -0.04  
## 61  -0.01              0.00            -0.01     -0.01     -0.04     -0.10  
## 73   0.14              0.00             0.25      0.34      0.14     -1.05_*
## 112  0.00              0.00             0.00      0.00     -0.01      0.03  
## 130  0.02              0.03             0.02      0.02      0.02      0.05  
## 131 -0.05             -0.06            -0.04     -0.04     -0.04     -0.10  
## 143  0.00              0.05             0.01      0.00      0.01     -0.16  
## 144  0.00              0.06             0.01      0.00      0.01     -0.20  
## 145 -0.02              0.23             0.02     -0.07      0.05     -0.95_*
##     cov.r   cook.d hat    
## 30   1.26_*  0.00   0.14  
## 34   1.25_*  0.00   0.14  
## 41   0.00_*  0.63   0.06  
## 56   1.32_*  0.00   0.18  
## 57   1.32_*  0.00   0.18  
## 58   1.31_*  0.00   0.17  
## 59   1.40_*  0.00   0.23  
## 60   1.32_*  0.00   0.18  
## 61   1.36_*  0.00   0.20  
## 73   0.42_*  0.08   0.08  
## 112  1.30_*  0.00   0.17  
## 130  1.41_*  0.00   0.23_*
## 131  1.39_*  0.00   0.22  
## 143  1.34_*  0.00   0.19  
## 144  1.33_*  0.00   0.19  
## 145  1.06    0.07   0.22  
##            dfb.1_      dfb.Lng1      dfb.Lng2      dfb.Lng3      dfb.Hght
## 30   2.580657e-02  0.0223299207 -0.0154863393  0.0014146489 -0.0397794935
## 34  -3.167602e-02  0.0480910000 -0.1091431122  0.1124271259 -0.0724780748
## 41  -1.277672e+00 -0.3652168932 -1.1501951380  2.2307934786 -1.2298503611
## 56  -8.628075e-03 -0.0222452468  0.0148209484  0.0037373265  0.0089216429
## 57  -5.743301e-03 -0.0131788974  0.0050813933  0.0087001318 -0.0027703164
## 58   1.113439e-05 -0.0005213034  0.0003499015  0.0001243448 -0.0001206951
## 59  -1.856940e-02  0.0016959945 -0.0047480692  0.0106887494  0.0141279431
## 60  -2.214342e-03 -0.0043581698  0.0009228821  0.0038357026 -0.0005247825
## 61   9.340263e-03 -0.0273326402  0.0217847893 -0.0044419791 -0.0096076823
## 73  -4.359304e-01 -0.1839010239  0.0206585007  0.2125023894 -0.0548566391
## 112  2.798658e-03  0.0013576464 -0.0018980020 -0.0008784260 -0.0042621696
## 130 -2.015384e-02  0.0018569474 -0.0240888207  0.0316793318  0.0005403120
## 131  4.070483e-02 -0.0088219699  0.0558720924 -0.0684118449  0.0017286042
## 143  1.968342e-02  0.0263887900 -0.0149671623 -0.0228048283  0.0463208907
## 144  2.466371e-02  0.0330656664 -0.0187541451 -0.0285748929  0.0580409758
## 145  2.059868e-01  0.3314173449 -0.0725073794 -0.3342373143  0.4796805964
##          dfb.Wdth dfb.fctr(Spcs)Prk dfb.fctr(Spcs)Prc dfb.fctr(Spcs)Pk
## 30   0.0115717348     -1.620837e-02     -2.356760e-02    -0.0362766148
## 34   0.0248326070      5.765612e-02      4.583255e-02     0.0048982302
## 41   0.3218125404      1.450561e+00      1.055103e+00     0.1274248028
## 56  -0.0219463175      7.591673e-03      1.082739e-02     0.0073459070
## 57  -0.0130984543      6.345264e-03      6.542872e-03    -0.0005500383
## 58  -0.0005679361      2.806242e-05      6.821756e-05    -0.0001620613
## 59  -0.0549370345      1.405951e-02      1.956781e-02    -0.0028008282
## 60  -0.0040868324      2.624618e-03      3.056877e-03     0.0011306529
## 61   0.0214609039     -5.850230e-03     -8.714295e-03     0.0024505650
## 73  -0.0124378813      3.513671e-01      1.442476e-01    -0.0034339660
## 112  0.0220097692     -1.617274e-03     -3.250599e-03     0.0037318093
## 130  0.0098152325      2.288010e-02      2.433528e-02     0.0285172975
## 131 -0.0270221113     -4.737331e-02     -5.003789e-02    -0.0607730861
## 143  0.0584494043     -1.587042e-02      2.645666e-03     0.0498919322
## 144  0.0732382390     -1.988595e-02      3.315070e-03     0.0625155602
## 145 -0.0021410064     -2.201665e-01     -2.110209e-02     0.2336883034
##         dfb.f(S)R     dfb.f(S)S    dfb.f(S)W        dffit        cov.r
## 30  -3.070229e-02 -3.116635e-02 -0.022421058 -0.057170418 1.264957e+00
## 34   1.164426e-02  1.862705e-02  0.033498199 -0.163157120 1.249559e+00
## 41  -5.961052e-01  9.337748e-01  0.688954220 -5.767246675 1.851830e-08
## 56   1.171659e-02  9.394315e-03  0.062358424  0.135036409 1.316471e+00
## 57   4.910575e-03  4.347414e-03  0.039150132  0.088185560 1.323355e+00
## 58   2.135671e-05 -4.216138e-05  0.002893397  0.007203756 1.309492e+00
## 59   1.834681e-02  1.277276e-02 -0.019054415 -0.116004801 1.395939e+00
## 60   2.220594e-03  2.003827e-03 -0.013629016 -0.042296773 1.315864e+00
## 61  -7.536781e-03 -5.122390e-03 -0.044011870 -0.096490068 1.357478e+00
## 73   2.465103e-01  3.354236e-01  0.135603358 -1.047426737 4.183338e-01
## 112 -3.828913e-03 -1.025856e-03 -0.005132304  0.026990275 1.301093e+00
## 130  1.772633e-02  2.094814e-02  0.019672317  0.046609478 1.406358e+00
## 131 -3.504120e-02 -4.250033e-02 -0.039855167 -0.104464204 1.385097e+00
## 143  5.470260e-03  3.182825e-03  0.005919340 -0.158552522 1.336206e+00
## 144  6.854343e-03  3.988142e-03  0.007417048 -0.198669390 1.329716e+00
## 145  2.449334e-02 -6.793947e-02  0.050854905 -0.953477596 1.059399e+00
##           cook.d        hat
## 30  2.742003e-04 0.14335488
## 34  2.231105e-03 0.14273832
## 41  6.252929e-01 0.06172339
## 56  1.529114e-03 0.18111993
## 57  6.523405e-04 0.18219171
## 58  4.354121e-06 0.17117644
## 59  1.128750e-03 0.22543609
## 60  1.500972e-04 0.17574171
## 61  7.809795e-04 0.20285900
## 73  8.445448e-02 0.07709637
## 112 6.112051e-05 0.16605984
## 130 1.822678e-04 0.22871224
## 131 9.153821e-04 0.21890489
## 143 2.107756e-03 0.19464247
## 144 3.307955e-03 0.19464247
## 145 7.459516e-02 0.21622470

Weighted Least Squares Model

# Observation weights: inverse of the squared fitted values from the
# log-transformed model
# NOTE(review): those fitted values are on the log scale while the WLS response
# below is raw Weight - confirm this weighting scheme is intended
weights <- 1 / fitted.values(lm_model_log)^2

# Weighted least squares fit with the same predictors as the linear model
wls_model <- lm(
  formula = Weight ~ Length1 + Length2 + Length3 + Height + Width + factor(Species),
  data = fish_df,
  weights = weights
)

Adequacy Check

# Print the diagnostics and plots for the weighted least squares model
check_model_adequacy(wls_model)
## 
## Call:
## lm(formula = Weight ~ Length1 + Length2 + Length3 + Height + 
##     Width + factor(Species), data = fish_df, weights = weights)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.466 -11.460  -3.076   7.407  88.865 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -654.91     132.79  -4.932 2.18e-06 ***
## Length1                     16.43      37.30   0.440   0.6604    
## Length2                     -2.27      43.91  -0.052   0.9588    
## Length3                     14.03      29.72   0.472   0.6375    
## Height                      15.18      15.03   1.010   0.3142    
## Width                       13.64      26.78   0.509   0.6112    
## factor(Species)Parkki       58.05      78.68   0.738   0.4618    
## factor(Species)Perch        81.72     123.71   0.661   0.5099    
## factor(Species)Pike       -127.89     148.86  -0.859   0.3917    
## factor(Species)Roach        25.82      97.39   0.265   0.7913    
## factor(Species)Smelt       277.07     124.54   2.225   0.0276 *  
## factor(Species)Whitefish    58.86     102.39   0.575   0.5663    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.27 on 147 degrees of freedom
## Multiple R-squared:  0.9224, Adjusted R-squared:  0.9166 
## F-statistic: 158.9 on 11 and 147 DF,  p-value: < 2.2e-16

## Analysis of Variance Table
## 
## Response: Weight
##                  Df Sum Sq Mean Sq   F value    Pr(>F)    
## Length1           1 532327  532327 1595.4483 < 2.2e-16 ***
## Length2           1     27      27    0.0805 0.7770436    
## Length3           1   5333    5333   15.9846 0.0001007 ***
## Height            1  15423   15423   46.2231 2.465e-10 ***
## Width             1     39      39    0.1163 0.7335544    
## factor(Species)   6  29916    4986   14.9437 2.658e-13 ***
## Residuals       147  49047     334                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.7019662, Df = 1, p = 0.40212
## Skipping Durbin-Watson test: weighted regressions are not supported.
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.87148, p-value = 1.866e-10

## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 1.1714, df = 11, p-value = 0.9999
## 
##                       GVIF Df GVIF^(1/(2*Df))
## Length1         2798.60652  1       52.901857
## Length2         4557.74577  1       67.511079
## Length3         2502.78976  1       50.027890
## Height            89.50428  1        9.460670
## Width             49.94579  1        7.067233
## factor(Species) 2370.09050  6        1.910867
##     rstudent unadjusted p-value Bonferroni p
## 73  5.920809         2.1953e-08   3.4905e-06
## 143 3.915135         1.3821e-04   2.1975e-02
## Potentially influential observations of
##   lm(formula = Weight ~ Length1 + Length2 + Length3 + Height +      Width + factor(Species), data = fish_df, weights = weights) :
## 
##     dfb.1_  dfb.Lng1 dfb.Lng2 dfb.Lng3 dfb.Hght dfb.Wdth dfb.fctr(Spcs)Prk
## 36   0.21   -0.06     0.10    -0.10    -0.05     0.07    -0.16            
## 56   0.02    0.04    -0.03    -0.01    -0.02     0.04    -0.02            
## 57   0.02    0.05    -0.02    -0.03     0.01     0.05    -0.02            
## 58   0.00    0.02    -0.01    -0.01     0.01     0.03     0.00            
## 59   0.01    0.00     0.00     0.00     0.00     0.02     0.00            
## 73   1.19_*  0.60    -0.11    -0.66     0.19     0.21    -0.99            
## 130  0.07    0.01     0.09    -0.12     0.01    -0.05    -0.09            
## 131  0.12    0.00     0.17    -0.22     0.03    -0.12    -0.15            
## 143 -0.17   -0.29     0.18     0.28    -0.59    -0.63     0.15            
## 144 -0.15   -0.26     0.16     0.25    -0.52    -0.56     0.13            
## 145 -0.21   -0.38     0.09     0.42    -0.62    -0.06     0.24            
## 157 -0.06    0.09    -0.17     0.14    -0.02     0.06     0.08            
##     dfb.fctr(Spcs)Prc dfb.fctr(Spcs)Pk dfb.f(S)R dfb.f(S)S dfb.f(S)W dffit  
## 36  -0.14             -0.04             0.00     -0.16     -0.11      0.84  
## 56  -0.02             -0.01            -0.02     -0.02     -0.15     -0.30  
## 57  -0.02              0.01            -0.01     -0.01     -0.17     -0.36  
## 58   0.00              0.01             0.00      0.01     -0.19     -0.43  
## 59  -0.01              0.00            -0.01      0.00      0.01      0.04  
## 73  -0.48              0.06            -0.66     -0.89     -0.38      2.66_*
## 130 -0.09             -0.11            -0.06     -0.08     -0.07     -0.19  
## 131 -0.15             -0.19            -0.10     -0.13     -0.11     -0.36  
## 143 -0.09             -0.62            -0.14     -0.11     -0.13      1.66_*
## 144 -0.08             -0.55            -0.12     -0.10     -0.12      1.47_*
## 145 -0.02             -0.36            -0.09      0.01     -0.11      1.15_*
## 157  0.08              0.07             0.05      0.04      0.06     -0.27  
##     cov.r   cook.d hat  
## 36   0.70_*  0.06   0.10
## 56   1.35_*  0.01   0.21
## 57   1.33_*  0.01   0.21
## 58   1.25_*  0.02   0.19
## 59   1.36_*  0.00   0.20
## 73   0.10_*  0.48   0.17
## 130  1.36_*  0.00   0.21
## 131  1.29_*  0.01   0.20
## 143  0.39_*  0.21   0.15
## 144  0.50_*  0.17   0.15
## 145  0.65_*  0.11   0.14
## 157  1.26_*  0.01   0.16
##           dfb.1_     dfb.Lng1      dfb.Lng2     dfb.Lng3     dfb.Hght
## 36   0.206052805 -0.056811547  0.1043450978 -0.099558874 -0.050484435
## 56   0.018968422  0.039542021 -0.0274747530 -0.008358706 -0.018873950
## 57   0.020101876  0.045379369 -0.0171413541 -0.034781468  0.014603095
## 58  -0.002116319  0.019007351 -0.0135381706 -0.006686314  0.008369706
## 59   0.005986070  0.001778305 -0.0006142875 -0.003549798 -0.003183238
## 73   1.186158483  0.599659977 -0.1097381731 -0.659959083  0.193781619
## 130  0.073830069  0.010453289  0.0870022745 -0.124905713  0.008653279
## 131  0.120651566  0.001938487  0.1693636484 -0.222417602  0.026855291
## 143 -0.170514830 -0.294142350  0.1769741837  0.284100717 -0.591004853
## 144 -0.150851954 -0.260223397  0.1565664494  0.251339713 -0.522853275
## 145 -0.207167706 -0.379778506  0.0869031996  0.420939330 -0.617639295
## 157 -0.061733866  0.086927432 -0.1738300220  0.138269569 -0.018732609
##        dfb.Wdth dfb.fctr(Spcs)Prk dfb.fctr(Spcs)Prc dfb.fctr(Spcs)Pk
## 36   0.06793502     -0.1640531347     -1.387828e-01      -0.03943022
## 56   0.04497524     -0.0163987627     -2.217259e-02      -0.01438838
## 57   0.05013782     -0.0227499346     -2.018855e-02       0.00904348
## 58   0.02896943      0.0002550724     -2.325441e-05       0.01274061
## 59   0.01965630     -0.0045292588     -5.804950e-03       0.00266487
## 73   0.21116628     -0.9888008438     -4.754802e-01       0.05611569
## 130 -0.05296619     -0.0877934237     -8.735768e-02      -0.10648186
## 131 -0.11700754     -0.1484982666     -1.455975e-01      -0.18657903
## 143 -0.62905977      0.1459852878     -8.747702e-02      -0.61669433
## 144 -0.55651990      0.1291510304     -7.738963e-02      -0.54558037
## 145 -0.05977564      0.2356574591     -2.017776e-02      -0.36007292
## 157  0.06372746      0.0788105324      8.342151e-02       0.07115814
##        dfb.f(S)R    dfb.f(S)S    dfb.f(S)W      dffit      cov.r       cook.d
## 36   0.003456882 -0.163248558 -0.107946546  0.8384556 0.70102278 0.0564031993
## 56  -0.024343166 -0.020021726 -0.149544715 -0.2959972 1.34561131 0.0073350729
## 57  -0.014378770 -0.012574706 -0.171802352 -0.3618836 1.32555490 0.0109517752
## 58   0.002005395  0.005284479 -0.190461954 -0.4323664 1.25395153 0.0155996057
## 59  -0.005478972 -0.003559482  0.009685473  0.0416554 1.36357145 0.0001455814
## 73  -0.662292491 -0.888558402 -0.383354240  2.6646150 0.09866255 0.4803880044
## 130 -0.061981921 -0.078015105 -0.066239224 -0.1942369 1.36228149 0.0031625020
## 131 -0.097866848 -0.128345879 -0.107659848 -0.3613955 1.29324399 0.0109185461
## 143 -0.136521114 -0.113664097 -0.133979231  1.6618256 0.38656774 0.2096990584
## 144 -0.120778215 -0.100556950 -0.118529449  1.4701926 0.49655472 0.1675852695
## 145 -0.090979857  0.010333574 -0.114302323  1.1543806 0.64613216 0.1057662985
## 157  0.045744119  0.042282900  0.062665472 -0.2662829 1.25554002 0.0059343685
##            hat
## 36  0.09517273
## 56  0.21439073
## 57  0.21306457
## 58  0.18939258
## 59  0.20447308
## 73  0.16842554
## 130 0.21238311
## 131 0.19668353
## 143 0.15266292
## 144 0.15266292
## 145 0.13772783
## 157 0.16131194

Box-Cox Transformed Model

# Box-Cox requires a strictly positive response. When 'Weight' contains zero or
# negative entries, every value is shifted by (smallest positive weight + 0.1);
# otherwise the response is used unchanged.
has_nonpositive_weight <- any(fish_df$Weight <= 0)
if (has_nonpositive_weight) {
  smallest_positive_weight <- min(fish_df$Weight[fish_df$Weight > 0])
  weight_shift <- smallest_positive_weight + 0.1
  fish_df$Adjusted_Weight <- fish_df$Weight + weight_shift
} else {
  fish_df$Adjusted_Weight <- fish_df$Weight
}

# Profile the Box-Cox log-likelihood for the full additive model
b <- boxcox(
  lm(
    Adjusted_Weight ~ Length1 + Length2 + Length3 + Height + Width +
      factor(Species),
    data = fish_df
  )
)

# Grid value of lambda with the highest profiled log-likelihood
lambda <- b$x[which.max(b$y)]
# The transformation below uses a fixed lambda of 0.5, not the grid estimate
# stored in `lambda`.
# NOTE(review): presumably 0.5 is a rounded version of that estimate - confirm.
lambda_optimal <- 0.5

# Box-Cox transform of the (possibly shifted) response: (y^lambda - 1) / lambda
fish_df$Transformed_Weight <-
  (fish_df$Adjusted_Weight^lambda_optimal - 1) / lambda_optimal

# Refit the same specification with the transformed response
lm_transformed <- lm(
  Transformed_Weight ~ Length1 + Length2 + Length3 + Height + Width +
    factor(Species),
  data = fish_df
)

Adequacy Check

# Print the diagnostic report for the Box-Cox model. The output that follows
# shows the model summary, the ANOVA table, variance / autocorrelation /
# normality tests, VIFs, an outlier test and influence measures.
check_model_adequacy(lm_transformed)
## 
## Call:
## lm(formula = Transformed_Weight ~ Length1 + Length2 + Length3 + 
##     Height + Width + factor(Species), data = fish_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.7061  -1.2644  -0.0962   1.2055   7.9910 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -23.2723     3.4487  -6.748 3.22e-10 ***
## Length1                    0.2662     0.9845   0.270  0.78728    
## Length2                   -0.4179     1.2407  -0.337  0.73671    
## Length3                    1.3635     0.7951   1.715  0.08848 .  
## Height                     0.8889     0.3543   2.509  0.01320 *  
## Width                      1.9327     0.6489   2.978  0.00339 ** 
## factor(Species)Parkki      4.1837     2.0543   2.037  0.04349 *  
## factor(Species)Perch       5.0973     3.2650   1.561  0.12063    
## factor(Species)Pike       -2.9773     3.6719  -0.811  0.41877    
## factor(Species)Roach       2.0657     2.4779   0.834  0.40582    
## factor(Species)Smelt       9.1688     3.2410   2.829  0.00532 ** 
## factor(Species)Whitefish   5.2000     2.6231   1.982  0.04930 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.546 on 147 degrees of freedom
## Multiple R-squared:  0.9815, Adjusted R-squared:  0.9801 
## F-statistic: 709.6 on 11 and 147 DF,  p-value: < 2.2e-16

## Analysis of Variance Table
## 
## Response: Transformed_Weight
##                  Df Sum Sq Mean Sq  F value    Pr(>F)    
## Length1           1  45398   45398 7002.208 < 2.2e-16 ***
## Length2           1   1558    1558  240.365 < 2.2e-16 ***
## Length3           1    524     524   80.746 1.155e-15 ***
## Height            1   2469    2469  380.769 < 2.2e-16 ***
## Width             1    267     267   41.112 1.846e-09 ***
## factor(Species)   6    389      65   10.004 2.967e-09 ***
## Residuals       147    953       6                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.3102006, Df = 1, p = 0.57756
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 1.7426, p-value = 0.009606
## alternative hypothesis: true autocorrelation is greater than 0
## 
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.86611, p-value = 1.023e-10

## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 7.12, df = 11, p-value = 0.7893
## 
##                       GVIF Df GVIF^(1/(2*Df))
## Length1         2360.42508  1       48.584206
## Length2         4307.91811  1       65.634732
## Length3         2076.93715  1       45.573426
## Height            56.20370  1        7.496913
## Width             29.16651  1        5.400602
## factor(Species) 1509.77571  6        1.840388
##    rstudent unadjusted p-value Bonferroni p
## 41 -7.45782         7.2149e-12   1.1472e-09
## 14 -3.80379         2.0891e-04   3.3217e-02
## Potentially influential observations of
##   lm(formula = Transformed_Weight ~ Length1 + Length2 + Length3 +      Height + Width + factor(Species), data = fish_df) :
## 
##     dfb.1_ dfb.Lng1 dfb.Lng2 dfb.Lng3 dfb.Hght dfb.Wdth dfb.fctr(Spcs)Prk
## 14  -0.23  -0.31     0.35    -0.21     0.39    -0.09     0.21            
## 41  -0.42  -0.12    -0.38     0.74    -0.41     0.11     0.48            
## 56   0.01   0.03    -0.02    -0.01    -0.01     0.03    -0.01            
## 57   0.02   0.04    -0.02    -0.03     0.01     0.04    -0.02            
## 59  -0.02   0.00     0.00     0.01     0.01    -0.05     0.01            
## 73   0.41   0.17    -0.02    -0.20     0.05     0.01    -0.33            
## 130 -0.04   0.00    -0.04     0.06     0.00     0.02     0.04            
## 131  0.14  -0.03     0.19    -0.23     0.01    -0.09    -0.16            
## 143 -0.19  -0.25     0.14     0.22    -0.45    -0.56     0.15            
## 144 -0.15  -0.21     0.12     0.18    -0.36    -0.46     0.12            
## 145  0.02   0.03    -0.01    -0.03     0.04     0.00    -0.02            
##     dfb.fctr(Spcs)Prc dfb.fctr(Spcs)Pk dfb.f(S)R dfb.f(S)S dfb.f(S)W dffit  
## 14   0.20              0.37             0.35      0.31      0.24     -0.87_*
## 41   0.35              0.04            -0.20      0.31      0.23     -1.91_*
## 56  -0.01             -0.01            -0.02     -0.01     -0.09     -0.18  
## 57  -0.02              0.00            -0.02     -0.01     -0.13     -0.30  
## 59   0.02              0.00             0.02      0.01     -0.02     -0.10  
## 73  -0.13              0.00            -0.23     -0.31     -0.13      0.98_*
## 130  0.05              0.05             0.03      0.04      0.04      0.09  
## 131 -0.17             -0.20            -0.12     -0.14     -0.13     -0.35  
## 143 -0.03             -0.48            -0.05     -0.03     -0.06      1.53_*
## 144 -0.02             -0.39            -0.04     -0.02     -0.05      1.23_*
## 145  0.00              0.02             0.00     -0.01      0.00     -0.08  
##     cov.r   cook.d hat    
## 14   0.37_*  0.06   0.05  
## 41   0.02_*  0.22   0.06  
## 56   1.31_*  0.00   0.18  
## 57   1.29_*  0.01   0.18  
## 59   1.40_*  0.00   0.23  
## 73   0.48_*  0.07   0.08  
## 130  1.40_*  0.00   0.23_*
## 131  1.34_*  0.01   0.22  
## 143  0.63_*  0.18   0.19  
## 144  0.81    0.12   0.19  
## 145  1.38_*  0.00   0.22  
##          dfb.1_     dfb.Lng1     dfb.Lng2     dfb.Lng3     dfb.Hght
## 14  -0.22754600 -0.309997587  0.351856239 -0.211870492  0.386799783
## 41  -0.42376205 -0.121130551 -0.381482275  0.739881558 -0.407901318
## 56   0.01181730  0.030467835 -0.020299268 -0.005118768 -0.012219381
## 57   0.01923502  0.044137741 -0.017018209 -0.029137807  0.009278129
## 59  -0.01603207  0.001464253 -0.004099291  0.009228234  0.012197495
## 73   0.40666693  0.171555968 -0.019271720 -0.198237358  0.051174178
## 130 -0.03736911  0.003443138 -0.044665317  0.058739587  0.001001843
## 131  0.13691200 -0.029672976  0.187927557 -0.230105413  0.005814215
## 143 -0.18952056 -0.254082801  0.144110379  0.219574852 -0.445997777
## 144 -0.15329102 -0.205511277  0.116561640  0.177600011 -0.360738988
## 145  0.01791805  0.028828798 -0.006307155 -0.029074097  0.041725683
##          dfb.Wdth dfb.fctr(Spcs)Prk dfb.fctr(Spcs)Prc dfb.fctr(Spcs)Pk
## 14  -0.0915951668        0.20507997       0.198853485      0.366564678
## 41   0.1067347408        0.48110370       0.349943340      0.042262658
## 56   0.0300584119       -0.01039781      -0.014829562     -0.010061200
## 57   0.0438683277       -0.02125107      -0.021912879      0.001842146
## 59  -0.0474304148        0.01213841       0.016894052     -0.002418122
## 73   0.0116029412       -0.32778026      -0.134564447      0.003203448
## 130  0.0181993330        0.04242412       0.045122306      0.052876566
## 131 -0.0908897293       -0.15934163      -0.168304032     -0.204412205
## 143 -0.5627764062        0.15280736      -0.025473625     -0.480381325
## 144 -0.4551937286        0.12359607      -0.020603981     -0.388549634
## 145 -0.0001862384       -0.01915148      -0.001835594      0.020327702
##        dfb.f(S)R    dfb.f(S)S    dfb.f(S)W       dffit      cov.r       cook.d
## 14   0.350342387  0.305191964  0.242363778 -0.87309661 0.36763669 0.0581929058
## 41  -0.197708700  0.309702684  0.228503681 -1.91280793 0.02404903 0.2223039518
## 56  -0.016047429 -0.012866769 -0.085408187 -0.18495039 1.30868729 0.0028670413
## 57  -0.016446116 -0.014560023 -0.131118587 -0.29534424 1.28518200 0.0072992311
## 59   0.015839891  0.011027482 -0.016450812 -0.10015386 1.39728958 0.0008414263
## 73  -0.229962342 -0.312907011 -0.126500466  0.97711423 0.47602589 0.0742919358
## 130  0.032868038  0.038841896  0.036476268  0.08642296 1.40429487 0.0006265656
## 131 -0.117862182 -0.142951218 -0.134054120 -0.35136867 1.34019696 0.0103276345
## 143 -0.052670058 -0.030645631 -0.056993993  1.52661296 0.62555649 0.1834275580
## 144 -0.042601431 -0.024787285 -0.046098784  1.23477928 0.81119970 0.1226282314
## 145  0.002130587 -0.005909809  0.004423685 -0.08293957 1.38194953 0.0005770755
##            hat
## 14  0.05004871
## 41  0.06172339
## 56  0.18111993
## 57  0.18219171
## 59  0.22543609
## 73  0.07709637
## 130 0.22871224
## 131 0.21890489
## 143 0.19464247
## 144 0.19464247
## 145 0.21622470

Comparing the fitted models on common criteria

# Candidate models, keyed by the label shown in the comparison table
models <- list(
  lm_model = lm_model, # linear model
  lm_model_log = lm_model_log, # log transformed model
  wls_model = wls_model, # wls model
  lm_transformed = lm_transformed # box cox transformed model
)

# Quantities that do not depend on the model being summarised
n <- nrow(fish_df) # number of observations
# Error-variance estimate used in Cp, taken from the untransformed linear model
mse_full <- sum(resid(lm_model)^2) / lm_model$df.residual

# One row of fit statistics per model. The rows are built first and bound once,
# instead of growing the data frame inside a loop.
#
# Caveats when reading the table:
# - RSS is the sum of squared unweighted residuals, also for the weighted fit.
# - Cp divides each model's RSS by `mse_full`, which is on the scale of
#   lm_model's response; RSS, Cp and BIC of models fitted to a transformed
#   response (e.g. lm_transformed) are on a different scale and are not
#   directly comparable with those of lm_model.
model_rows <- lapply(names(models), function(model_name) {
  model <- models[[model_name]]
  model_summary <- summary(model)
  rss <- sum(resid(model)^2)
  p <- length(coef(model)) # number of coefficients, intercept included
  data.frame(
    Model = model_name,
    RSS = rss,
    R2 = model_summary$r.squared,
    AdjR2 = model_summary$adj.r.squared,
    Cp = (rss / mse_full) - (n - 2 * p), # Mallows' Cp
    BIC = AIC(model, k = log(n)), # AIC with penalty log(n), i.e. BIC
    stringsAsFactors = FALSE
  )
})
model_comparisons <- do.call(rbind, model_rows)

# Print the model comparisons
print(model_comparisons)
##            Model          RSS        R2     AdjR2         Cp       BIC
## 1       lm_model 1.294118e+06 0.9360849 0.9313021   12.00000 1948.8235
## 2   lm_model_log 2.748158e+01 0.9069636 0.9000017 -134.99688  238.0115
## 3      wls_model 1.596616e+06 0.9224076 0.9166013   46.36094 1952.6118
## 4 lm_transformed 9.530486e+02 0.9815146 0.9801313 -134.89174  801.8493
# Report the candidate with the largest adjusted R-squared
best_by_adj_r2 <- model_comparisons[
  which.max(model_comparisons[["AdjR2"]]), ,
  drop = FALSE
]
print(paste("Best model by Adjusted R2:", best_by_adj_r2[["Model"]]))
## [1] "Best model by Adjusted R2: lm_transformed"

Subset regression for Variable Selection

# Forward search over the terms of the Box-Cox model, keeping a single best
# model for each size (nbest = 1)
subset_selection <- leaps::regsubsets(
  Transformed_Weight ~ Length1 + Length2 + Length3 + Height + Width +
    factor(Species),
  data = fish_df,
  method = "forward",
  nbest = 1,
  really.big = TRUE
)

# Per-size fit statistics of the search (rss, rsq, cp, bic, adjr2 are used
# further down)
subset_summary <- summary(subset_selection)

The plots below visually assess each candidate subset using Mallows’ Cp, BIC, and adjusted R-squared, aiding in the selection of the best subset of predictors.

# One selection plot per criterion: Mallows' Cp, BIC and adjusted R-squared
for (criterion in c("Cp", "bic", "adjr2")) {
  plot(subset_selection, scale = criterion)
}

#### Evaluating Model Selection Criteria

# Pull the per-size criteria out of the regsubsets summary
RSS <- subset_summary[["rss"]]
r2 <- subset_summary[["rsq"]]
Cp <- subset_summary[["cp"]]
BIC <- subset_summary[["bic"]]
Adj_r2 <- subset_summary[["adjr2"]]

# Side-by-side table with one row per model size
cbind(RSS, r2, Cp, BIC, Adj_r2)
##            RSS        r2         Cp       BIC    Adj_r2
## [1,] 4556.0198 0.9116311 547.728996 -375.6337 0.9110683
## [2,] 1512.2457 0.9706684  80.251593 -545.9185 0.9702923
## [3,] 1305.3906 0.9746806  50.345877 -564.2375 0.9741905
## [4,] 1126.7396 0.9781457  24.790423 -582.5692 0.9775780
## [5,] 1044.5853 0.9797392  14.118792 -589.5379 0.9790770
## [6,] 1000.8034 0.9805884   9.365790 -591.2769 0.9798221
## [7,]  990.7434 0.9807835   9.814102 -587.8144 0.9798926
## [8,]  980.6719 0.9809788  10.260667 -584.3700 0.9799644

Plots of each criterion against the number of predictors, with the optimum marked in red

# Each criterion is drawn against model size; a red star marks the optimum.

# Mallows' Cp (smaller is better)
best_cp <- which.min(Cp)
plot(Cp, type = "l", lwd = 2, xlab = "Number of Predictors", ylab = "Cp")
points(best_cp, Cp[best_cp], pch = 8, cex = 2, lwd = 2, col = "red")

# BIC (smaller is better)
best_bic <- which.min(BIC)
plot(BIC, type = "l", lwd = 2, xlab = "Number of Predictors", ylab = "BIC")
points(best_bic, BIC[best_bic], pch = 8, cex = 2, lwd = 2, col = "red")

# Adjusted R-squared (larger is better)
best_adj_r2 <- which.max(Adj_r2)
plot(
  Adj_r2,
  type = "l", lwd = 2,
  xlab = "Number of Predictors", ylab = "Adjusted R-squared"
)
points(
  best_adj_r2, Adj_r2[best_adj_r2],
  pch = 8, cex = 2, lwd = 2, col = "red"
)

#### Optimal coefficients for each criterion

# Coefficients of the model size with the smallest Mallows' Cp
coef(subset_selection, id = which.min(Cp))
##           (Intercept)               Length3                 Width 
##            -18.700956              1.320857              2.799469 
## factor(Species)Parkki   factor(Species)Pike  factor(Species)Roach 
##              2.241776            -10.724132             -2.473789 
##  factor(Species)Smelt 
##              3.970063
# Coefficients of the model size with the smallest BIC
coef(subset_selection, id = which.min(BIC))
##           (Intercept)               Length3                 Width 
##            -18.700956              1.320857              2.799469 
## factor(Species)Parkki   factor(Species)Pike  factor(Species)Roach 
##              2.241776            -10.724132             -2.473789 
##  factor(Species)Smelt 
##              3.970063
# Coefficients of the model size with the largest adjusted R-squared
coef(subset_selection, id = which.max(Adj_r2))
##           (Intercept)               Length2               Length3 
##           -18.3413926             0.6423043             0.6073081 
##                Height                 Width factor(Species)Parkki 
##             0.4622490             2.6076994             1.5514236 
##   factor(Species)Pike  factor(Species)Roach  factor(Species)Smelt 
##            -8.1250754            -1.7165739             4.4906379
# Term names of the subset with the lowest BIC
best_model_vars_bic <- names(
  coef(subset_selection, id = which.min(subset_summary[["bic"]]))
)
print(best_model_vars_bic)
## [1] "(Intercept)"           "Length3"               "Width"                
## [4] "factor(Species)Parkki" "factor(Species)Pike"   "factor(Species)Roach" 
## [7] "factor(Species)Smelt"
# Term names of the subset with the lowest Mallows' Cp
best_model_vars_cp <- names(
  coef(subset_selection, id = which.min(subset_summary[["cp"]]))
)
print(best_model_vars_cp)
## [1] "(Intercept)"           "Length3"               "Width"                
## [4] "factor(Species)Parkki" "factor(Species)Pike"   "factor(Species)Roach" 
## [7] "factor(Species)Smelt"
# Term names of the subset with the highest adjusted R-squared
best_model_vars_adjr2 <- names(
  coef(subset_selection, id = which.max(subset_summary[["adjr2"]]))
)
print(best_model_vars_adjr2)
## [1] "(Intercept)"           "Length2"               "Length3"              
## [4] "Height"                "Width"                 "factor(Species)Parkki"
## [7] "factor(Species)Pike"   "factor(Species)Roach"  "factor(Species)Smelt"

Model Fitting: Choosing the Best Model and Variables

# Refit the subset chosen by BIC (and by Cp) on the complete data set.
#
# The search retained Length3, Width and the dummy columns for Parkki, Pike,
# Roach and Smelt. A retained dummy gives that species its own intercept shift
# while the remaining species share the baseline; it does not mean the rows of
# the other species should be dropped. The model is therefore fitted to every
# row of fish_df with one indicator per selected species, rather than filtering
# rows through `subset =` and using the full Species factor.
# NOTE: diagnostics printed from an earlier fit must be regenerated after this
# change.
best_subset_model_bic <- lm(
  Transformed_Weight ~ Length3 + Width +
    I(Species == "Parkki") + I(Species == "Pike") +
    I(Species == "Roach") + I(Species == "Smelt"),
  data = fish_df
)

# Run the diagnostic report on the refitted model
check_model_adequacy(best_subset_model_bic)
## 
## Call:
## lm(formula = Transformed_Weight ~ Length3 + Width + Species, 
##     data = fish_df, subset = Species %in% c("Parkki", "Pike", 
##         "Roach", "Smelt"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.5443  -0.8635   0.1469   1.2890   4.2608 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -15.19148    1.72648  -8.799 3.81e-12 ***
## Length3        1.57677    0.16242   9.708 1.32e-13 ***
## Width          0.59493    1.27061   0.468 0.641440    
## SpeciesPike  -15.48805    2.43738  -6.354 4.01e-08 ***
## SpeciesRoach  -4.30959    1.07258  -4.018 0.000177 ***
## SpeciesSmelt   0.07884    1.55800   0.051 0.959823    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.771 on 56 degrees of freedom
## Multiple R-squared:  0.9798, Adjusted R-squared:  0.978 
## F-statistic: 542.9 on 5 and 56 DF,  p-value: < 2.2e-16

## Analysis of Variance Table
## 
## Response: Transformed_Weight
##           Df  Sum Sq Mean Sq  F value    Pr(>F)    
## Length3    1 20269.3 20269.3 2639.182 < 2.2e-16 ***
## Width      1   153.3   153.3   19.957 3.895e-05 ***
## Species    3   427.0   142.3   18.531 1.791e-08 ***
## Residuals 56   430.1     7.7                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.02888963, Df = 1, p = 0.86503
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 2.0288, p-value = 0.3402
## alternative hypothesis: true autocorrelation is greater than 0
## 
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.73116, p-value = 2.479e-09

## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 2.3849, df = 5, p-value = 0.7937
## 
##             GVIF Df GVIF^(1/(2*Df))
## Length3 44.82414  1        6.695083
## Width   30.34475  1        5.508607
## Species 25.54705  3        1.716156
##     rstudent unadjusted p-value Bonferroni p
## 41 -8.952754         2.5149e-12   1.5593e-10
## Potentially influential observations of
##   lm(formula = Transformed_Weight ~ Length3 + Width + Species,      data = fish_df, subset = Species %in% c("Parkki", "Pike",          "Roach", "Smelt")) :
## 
##     dfb.1_ dfb.Lng3 dfb.Wdth dfb.SpcP dfb.SpcR dfb.SpcS dffit   cov.r   cook.d
## 41  -0.42  -0.03     0.20    -0.14    -1.28_*   0.28    -2.11_*  0.01_*  0.31 
## 142 -0.16  -0.09     0.15     0.05    -0.05     0.14     0.24    1.34_*  0.01 
## 143 -0.29   0.98    -0.77    -0.80     0.07    -0.18     1.19_*  1.09    0.22 
## 145  0.26  -0.08    -0.04     0.13     0.05    -0.14    -0.36    1.39_*  0.02 
##     hat  
## 41   0.05
## 142  0.19
## 143  0.29
## 145  0.23
##         dfb.1_    dfb.Lng3    dfb.Wdth    dfb.SpcP    dfb.SpcR   dfb.SpcS
## 41  -0.4171618 -0.03093509  0.20397593 -0.14493410 -1.28373477  0.2813857
## 142 -0.1627784 -0.09302800  0.15282140  0.04966278 -0.04843667  0.1397822
## 143 -0.2857662  0.98235123 -0.76801878 -0.80252811  0.07354258 -0.1789287
## 145  0.2603089 -0.08251684 -0.03518008  0.12650917  0.04544527 -0.1378732
##          dffit       cov.r     cook.d        hat
## 41  -2.1113349 0.005341991 0.30784292 0.05268596
## 142  0.2442338 1.336152489 0.01007557 0.18903818
## 143  1.1863550 1.085055968 0.22466661 0.28860512
## 145 -0.3637708 1.388234000 0.02228065 0.23426752
# Refit the subset chosen by adjusted R-squared on the complete data set.
#
# The search retained Length2, Length3, Height, Width and the dummy columns for
# Parkki, Pike, Roach and Smelt. Those four species effects are part of the
# selected model, so they enter as indicator terms and the model is fitted to
# every row of fish_df, rather than dropping the species terms and filtering
# rows through `subset =`.
# NOTE: diagnostics printed from an earlier fit must be regenerated after this
# change.
best_subset_model_adjr2 <- lm(
  Transformed_Weight ~ Length2 + Length3 + Height + Width +
    I(Species == "Parkki") + I(Species == "Pike") +
    I(Species == "Roach") + I(Species == "Smelt"),
  data = fish_df
)

# Run the diagnostic report on the refitted model
check_model_adequacy(best_subset_model_adjr2)
## 
## Call:
## lm(formula = Transformed_Weight ~ Length3 + Length2 + Height + 
##     Width, data = fish_df, subset = Species %in% c("Parkki", 
##     "Pike", "Roach", "Smelt"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.0108  -0.9036   0.6444   1.4476   8.9909 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11.9869     1.4045  -8.535 8.95e-12 ***
## Length3       0.5554     1.4688   0.378  0.70675    
## Length2       0.5662     1.4524   0.390  0.69810    
## Height        1.0364     0.3663   2.830  0.00643 ** 
## Width         0.2732     1.5208   0.180  0.85808    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.631 on 57 degrees of freedom
## Multiple R-squared:  0.9647, Adjusted R-squared:  0.9622 
## F-statistic: 389.3 on 4 and 57 DF,  p-value: < 2.2e-16

## Analysis of Variance Table
## 
## Response: Transformed_Weight
##           Df  Sum Sq Mean Sq   F value    Pr(>F)    
## Length3    1 20269.3 20269.3 1537.7339 < 2.2e-16 ***
## Length2    1    95.8    95.8    7.2655  0.009222 ** 
## Height     1   162.8   162.8   12.3512  0.000871 ***
## Width      1     0.4     0.4    0.0323  0.858083    
## Residuals 57   751.3    13.2                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 9.747459, Df = 1, p = 0.0017957
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 1.1906, p-value = 8.967e-05
## alternative hypothesis: true autocorrelation is greater than 0
## 
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.83149, p-value = 6.499e-07

## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 2.4087, df = 4, p-value = 0.6611
## 
##    Length3    Length2     Height      Width 
## 2135.98235 1884.20734    4.59343   25.32757 
##     rstudent unadjusted p-value Bonferroni p
## 41 -6.093245         1.0715e-07   6.6435e-06
## Potentially influential observations of
##   lm(formula = Transformed_Weight ~ Length3 + Length2 + Height +      Width, data = fish_df, subset = Species %in% c("Parkki",      "Pike", "Roach", "Smelt")) :
## 
##     dfb.1_ dfb.Lng3 dfb.Lng2 dfb.Hght dfb.Wdth dffit   cov.r   cook.d hat  
## 41  -0.46   0.40    -0.35     0.21    -0.70    -1.14_*  0.09_*  0.16   0.03
## 130  0.15  -0.35     0.36     0.16     0.14    -0.39    1.35_*  0.03   0.23
## 143 -0.52   0.35    -0.29     0.27    -0.66     1.27_*  0.66_*  0.29   0.16
## 144 -0.44   0.30    -0.25     0.23    -0.56     1.07_*  0.79    0.21   0.16
## 158  0.00   0.00     0.00     0.00     0.00     0.01    1.27_*  0.00   0.14
##           dfb.1_     dfb.Lng3     dfb.Lng2     dfb.Hght     dfb.Wdth
## 41  -0.462827733  0.404165726 -0.354558300  0.210859814 -0.704806678
## 130  0.149023648 -0.354043037  0.355011501  0.162439298  0.142594827
## 143 -0.517495890  0.354126625 -0.292054438  0.269701309 -0.663340589
## 144 -0.436810677  0.298913081 -0.246518860  0.227650912 -0.559916045
## 158  0.004516853 -0.004303681  0.004209706 -0.001505875  0.003587604
##            dffit      cov.r       cook.d        hat
## 41  -1.142237994 0.08891625 1.597127e-01 0.03394817
## 130 -0.391244056 1.34688132 3.087175e-02 0.22580714
## 143  1.269374354 0.65609075 2.858429e-01 0.16319211
## 144  1.071460241 0.79206102 2.114750e-01 0.16319211
## 158  0.005399422 1.27338555 5.934852e-06 0.14203929

Prediction Error Sum of Squares (PRESS) Calculation

library(DAAG)
## 
## Attaching package: 'DAAG'
## The following object is masked from 'package:MASS':
## 
##     hills
## The following object is masked from 'package:car':
## 
##     vif
# PRESS statistic of the BIC-selected model
DAAG::press(best_subset_model_bic)
## [1] 518.6548
# PRESS statistic of the model selected by adjusted R-squared
DAAG::press(best_subset_model_adjr2)
## [1] 895.466

Cross-validation and a train/test split to assess the model's effectiveness on unseen data

library(caret)
## Loading required package: lattice
# Fix the RNG state so the resampling is reproducible
set.seed(123)

# Resampling scheme: 10-fold cross-validation
train_control <- trainControl(number = 10, method = "cv")

# Cross-validated linear regression on the transformed response
cv_model <- train(
  Transformed_Weight ~ Length3 + Width + Species,
  data = fish_df,
  trControl = train_control,
  method = "lm"
)

# Resampled RMSE, R-squared and MAE
print(cv_model)
## Linear Regression 
## 
## 159 samples
##   3 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 143, 143, 143, 143, 143, 143, ... 
## Resampling results:
## 
##   RMSE     Rsquared   MAE     
##   2.49285  0.9807939  1.758636
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Evaluation parameters:

# Hold out 20% of the rows as a test set; the seed makes the split repeatable
set.seed(123)
train_index <- createDataPartition(
  y = fish_df$Transformed_Weight,
  p = 0.8,
  list = FALSE
)
train_data <- fish_df[train_index, ]
test_data <- fish_df[-train_index, ]

# Estimate the model from the training rows only
model_train <- lm(
  Transformed_Weight ~ Length3 + Width + Species,
  data = train_data
)

# Score the held-out rows
predictions <- predict(model_train, newdata = test_data)

# Root mean squared error on the held-out rows (transformed-weight scale)
test_rmse <- sqrt(mean((test_data$Transformed_Weight - predictions)^2))
print(paste("Test RMSE:", test_rmse))
## [1] "Test RMSE: 3.63144833568236"
# Test-set R-squared, taken from regressing the predictions on the observed
# transformed weights of the held-out rows
test_r2 <- summary(
  lm(predictions ~ test_data$Transformed_Weight)
)[["r.squared"]]
print(paste("Test R-squared:", test_r2))
## [1] "Test R-squared: 0.952502238335418"

Reference

Rathod, V. L. (n.d.). Fish Market. Kaggle. Retrieved 2024, from https://www.kaggle.com/datasets/vipullrathod/fish-market