MLR Test 1

Author

Dr Omar Bin Nazmi

options(repos = c(CRAN = "https://cran.r-project.org"))
install.packages("tidyverse")
Installing package into 'C:/Users/H P/AppData/Local/R/win-library/4.4'
(as 'lib' is unspecified)
package 'tidyverse' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\H P\AppData\Local\Temp\RtmpOojPuk\downloaded_packages
install.packages("summarytools")
Installing package into 'C:/Users/H P/AppData/Local/R/win-library/4.4'
(as 'lib' is unspecified)
package 'summarytools' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\H P\AppData\Local\Temp\RtmpOojPuk\downloaded_packages
install.packages("skimr")
Installing package into 'C:/Users/H P/AppData/Local/R/win-library/4.4'
(as 'lib' is unspecified)
package 'skimr' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\H P\AppData\Local\Temp\RtmpOojPuk\downloaded_packages
install.packages("gtsummary")
Installing package into 'C:/Users/H P/AppData/Local/R/win-library/4.4'
(as 'lib' is unspecified)
package 'gtsummary' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\H P\AppData\Local\Temp\RtmpOojPuk\downloaded_packages
install.packages("broom.helpers")
Installing package into 'C:/Users/H P/AppData/Local/R/win-library/4.4'
(as 'lib' is unspecified)
package 'broom.helpers' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\H P\AppData\Local\Temp\RtmpOojPuk\downloaded_packages
library(tidyverse) #data manipulation
Warning: package 'tidyverse' was built under R version 4.4.2
Warning: package 'dplyr' was built under R version 4.4.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(summarytools) #exploration
Warning: package 'summarytools' was built under R version 4.4.2

Attaching package: 'summarytools'

The following object is masked from 'package:tibble':

    view
library(skimr)
Warning: package 'skimr' was built under R version 4.4.2
library(gtsummary)
Warning: package 'gtsummary' was built under R version 4.4.2
library(broom.helpers)
Warning: package 'broom.helpers' was built under R version 4.4.2

Attaching package: 'broom.helpers'

The following objects are masked from 'package:gtsummary':

    all_categorical, all_continuous, all_contrasts, all_dichotomous,
    all_interaction, all_intercepts
lr_data <- read.csv(("C:/Users/H P/OneDrive - The Goose and Duck/Desktop/R/jomresearch_data_11March2024-main/data/linear_regression.csv"))

lr_data <- 
  lr_data %>% 
  mutate(species = as.factor(species),
         species = fct_recode(species, 
                              Setosa = "0",
                              Versicolor = "1",
                              Virginica = "2"))
str(lr_data)
'data.frame':   150 obs. of  4 variables:
 $ sepal_length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 $ sepal_width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 $ petal_width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 $ species     : Factor w/ 3 levels "Setosa","Versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
freq(lr_data$species)
Frequencies  
lr_data$species  
Type: Factor  

                   Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
---------------- ------ --------- -------------- --------- --------------
          Setosa     50     33.33          33.33     33.33          33.33
      Versicolor     50     33.33          66.67     33.33          66.67
       Virginica     50     33.33         100.00     33.33         100.00
            <NA>      0                               0.00         100.00
           Total    150    100.00         100.00    100.00         100.00
skimr::skim(lr_data)
Data summary
Name lr_data
Number of rows 150
Number of columns 4
_______________________
Column type frequency:
factor 1
numeric 3
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
species 0 1 FALSE 3 Set: 50, Ver: 50, Vir: 50

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
sepal_length 0 1 5.84 0.83 4.3 5.1 5.8 6.4 7.9 ▆▇▇▅▂
sepal_width 0 1 3.05 0.43 2.0 2.8 3.0 3.3 4.4 ▁▅▇▂▁
petal_width 0 1 1.20 0.76 0.1 0.3 1.3 1.8 2.5 ▇▁▇▅▃
freq(lr_data$species)
Frequencies  
lr_data$species  
Type: Factor  

                   Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
---------------- ------ --------- -------------- --------- --------------
          Setosa     50     33.33          33.33     33.33          33.33
      Versicolor     50     33.33          66.67     33.33          66.67
       Virginica     50     33.33         100.00     33.33         100.00
            <NA>      0                               0.00         100.00
           Total    150    100.00         100.00    100.00         100.00
descr(lr_data %>% select(-species))
Descriptive Statistics  
lr_data  
N: 150  

                    petal_width   sepal_length   sepal_width
----------------- ------------- -------------- -------------
             Mean          1.20           5.84          3.05
          Std.Dev          0.76           0.83          0.43
              Min          0.10           4.30          2.00
               Q1          0.30           5.10          2.80
           Median          1.30           5.80          3.00
               Q3          1.80           6.40          3.30
              Max          2.50           7.90          4.40
              MAD          1.04           1.04          0.37
              IQR          1.50           1.30          0.50
               CV          0.64           0.14          0.14
         Skewness         -0.10           0.31          0.33
      SE.Skewness          0.20           0.20          0.20
         Kurtosis         -1.36          -0.61          0.20
          N.Valid        150.00         150.00        150.00
        Pct.Valid        100.00         100.00        100.00
zero_mod <- lm(sepal_length ~ 1, data = lr_data)
add1(zero_mod, scope = ~ sepal_width + petal_width + species, test = "F")
Single term additions

Model:
sepal_length ~ 1
            Df Sum of Sq     RSS      AIC  F value Pr(>F)    
<none>                   102.168  -55.602                    
sepal_width  1     1.222 100.946  -55.407   1.7918 0.1828    
petal_width  1    68.356  33.813 -219.469 299.1950 <2e-16 ***
species      2    63.212  38.956 -196.230 119.2645 <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
full_mod <- lm(sepal_length ~., data = lr_data)
forw <- step(zero_mod, scope =~ sepal_width + petal_width + species, direction = "forward")
Start:  AIC=-55.6
sepal_length ~ 1

              Df Sum of Sq     RSS      AIC
+ petal_width  1    68.356  33.813 -219.469
+ species      2    63.212  38.956 -196.230
<none>                     102.168  -55.602
+ sepal_width  1     1.222 100.946  -55.407

Step:  AIC=-219.47
sepal_length ~ petal_width

              Df Sum of Sq    RSS     AIC
+ sepal_width  1    3.8885 29.924 -235.79
<none>                     33.813 -219.47
+ species      2    0.0360 33.777 -215.63

Step:  AIC=-235.79
sepal_length ~ petal_width + sepal_width

          Df Sum of Sq    RSS     AIC
+ species  2    2.5726 27.352 -245.28
<none>                 29.924 -235.79

Step:  AIC=-245.28
sepal_length ~ petal_width + sepal_width + species
back <- step(full_mod, direction = "backward")
Start:  AIC=-245.28
sepal_length ~ sepal_width + petal_width + species

              Df Sum of Sq    RSS     AIC
<none>                     27.352 -245.28
- petal_width  1    0.6139 27.966 -243.95
- species      2    2.5726 29.924 -235.79
- sepal_width  1    6.4251 33.777 -215.63
AIC(forw, back) #both and back has the lowest AIC, also can look at adjusted R^2
     df      AIC
forw  6 182.4032
back  6 182.4032
BIC(forw, back)
     df     BIC
forw  6 200.467
back  6 200.467
prefinal_mod <- back
summary(prefinal_mod)

Call:
lm(formula = sepal_length ~ sepal_width + petal_width + species, 
    data = lr_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.23928 -0.23774 -0.02426  0.18680  1.27525 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)         2.5270     0.3936   6.420 1.83e-09 ***
sepal_width         0.6996     0.1199   5.836 3.35e-08 ***
petal_width         0.3592     0.1991   1.804 0.073304 .  
speciesVersicolor   0.9947     0.2757   3.608 0.000424 ***
speciesVirginica    1.2526     0.3929   3.188 0.001754 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4343 on 145 degrees of freedom
Multiple R-squared:  0.7323,    Adjusted R-squared:  0.7249 
F-statistic: 99.16 on 4 and 145 DF,  p-value: < 2.2e-16
int_mod1 <- update(prefinal_mod, .~. + sepal_width*petal_width)
summary(int_mod1) 

Call:
lm(formula = sepal_length ~ sepal_width + petal_width + species + 
    sepal_width:petal_width, data = lr_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.25750 -0.24792 -0.02509  0.19002  1.22940 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)              2.30402    0.61196   3.765 0.000242 ***
sepal_width              0.76522    0.18267   4.189 4.86e-05 ***
petal_width              0.59291    0.52929   1.120 0.264488    
speciesVersicolor        0.98190    0.27776   3.535 0.000549 ***
speciesVirginica         1.22522    0.39808   3.078 0.002497 ** 
sepal_width:petal_width -0.06888    0.14446  -0.477 0.634215    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4355 on 144 degrees of freedom
Multiple R-squared:  0.7327,    Adjusted R-squared:  0.7234 
F-statistic: 78.95 on 5 and 144 DF,  p-value: < 2.2e-16
int_mod2 <- update(prefinal_mod, .~. + sepal_width*species)
summary(int_mod2) 

Call:
lm(formula = sepal_length ~ sepal_width + petal_width + species + 
    sepal_width:species, data = lr_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.22405 -0.24793 -0.01309  0.18311  1.29949 

Coefficients:
                              Estimate Std. Error t value Pr(>|t|)    
(Intercept)                    2.65315    0.56368   4.707 5.88e-06 ***
sepal_width                    0.66437    0.16477   4.032 8.95e-05 ***
petal_width                    0.33616    0.21287   1.579    0.117    
speciesVersicolor              0.83048    0.79192   1.049    0.296    
speciesVirginica               1.03046    0.82145   1.254    0.212    
sepal_width:speciesVersicolor  0.06004    0.26778   0.224    0.823    
sepal_width:speciesVirginica   0.08322    0.26625   0.313    0.755    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4372 on 143 degrees of freedom
Multiple R-squared:  0.7325,    Adjusted R-squared:  0.7213 
F-statistic: 65.26 on 6 and 143 DF,  p-value: < 2.2e-16
int_mod3 <- update(prefinal_mod, .~. + petal_width*species)
summary(int_mod3) 

Call:
lm(formula = sepal_length ~ sepal_width + petal_width + species + 
    petal_width:species, data = lr_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.29319 -0.23539 -0.01804  0.19273  1.30623 

Coefficients:
                              Estimate Std. Error t value Pr(>|t|)    
(Intercept)                    2.61950    0.41455   6.319 3.14e-09 ***
sepal_width                    0.68109    0.12119   5.620 9.67e-08 ***
petal_width                    0.23991    0.59190   0.405   0.6858    
speciesVersicolor              0.49010    0.49880   0.983   0.3275    
speciesVirginica               1.49563    0.51940   2.880   0.0046 ** 
petal_width:speciesVersicolor  0.46883    0.65920   0.711   0.4781    
petal_width:speciesVirginica  -0.01912    0.62363  -0.031   0.9756    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4349 on 143 degrees of freedom
Multiple R-squared:  0.7353,    Adjusted R-squared:  0.7242 
F-statistic:  66.2 on 6 and 143 DF,  p-value: < 2.2e-16
lr_data$res <- resid(prefinal_mod)
lr_data$pred <- fitted.values(prefinal_mod)
lr_data %>% 
  ggplot(aes(res)) +
  geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

lr_data %>% 
  ggplot(aes(sample = res)) +
  geom_qq() +
  geom_qq_line() 

ggplot(lr_data, aes(pred, res)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = 2)

ggplot(lr_data, aes(sepal_width, res)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = 2)

ggplot(lr_data, aes(petal_width, res)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = 2)

out_data <- cooks.distance(prefinal_mod)  
sum(out_data > 1)
[1] 0
install.packages("car")
Installing package into 'C:/Users/H P/AppData/Local/R/win-library/4.4'
(as 'lib' is unspecified)
package 'car' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\H P\AppData\Local\Temp\RtmpOojPuk\downloaded_packages
library(car)
Warning: package 'car' was built under R version 4.4.2
Loading required package: carData

Attaching package: 'car'
The following object is masked from 'package:dplyr':

    recode
The following object is masked from 'package:purrr':

    some
car::vif(prefinal_mod) #all aGSIF should be < 3.16 
                 GVIF Df GVIF^(1/(2*Df))
sepal_width  2.134107  1        1.460858
petal_width 18.236611  1        4.270435
species     26.176292  2        2.261919
summary(prefinal_mod)

Call:
lm(formula = sepal_length ~ sepal_width + petal_width + species, 
    data = lr_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.23928 -0.23774 -0.02426  0.18680  1.27525 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)         2.5270     0.3936   6.420 1.83e-09 ***
sepal_width         0.6996     0.1199   5.836 3.35e-08 ***
petal_width         0.3592     0.1991   1.804 0.073304 .  
speciesVersicolor   0.9947     0.2757   3.608 0.000424 ***
speciesVirginica    1.2526     0.3929   3.188 0.001754 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4343 on 145 degrees of freedom
Multiple R-squared:  0.7323,    Adjusted R-squared:  0.7249 
F-statistic: 99.16 on 4 and 145 DF,  p-value: < 2.2e-16
final_mod <- prefinal_mod
summary(final_mod)

Call:
lm(formula = sepal_length ~ sepal_width + petal_width + species, 
    data = lr_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.23928 -0.23774 -0.02426  0.18680  1.27525 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)         2.5270     0.3936   6.420 1.83e-09 ***
sepal_width         0.6996     0.1199   5.836 3.35e-08 ***
petal_width         0.3592     0.1991   1.804 0.073304 .  
speciesVersicolor   0.9947     0.2757   3.608 0.000424 ***
speciesVirginica    1.2526     0.3929   3.188 0.001754 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4343 on 145 degrees of freedom
Multiple R-squared:  0.7323,    Adjusted R-squared:  0.7249 
F-statistic: 99.16 on 4 and 145 DF,  p-value: < 2.2e-16
confint(final_mod) %>% round(digits = 2)
                  2.5 % 97.5 %
(Intercept)        1.75   3.31
sepal_width        0.46   0.94
petal_width       -0.03   0.75
speciesVersicolor  0.45   1.54
speciesVirginica   0.48   2.03
library(gtsummary)

final_mod %>% 
  tbl_regression(
    label = list(sepal_width ~ "Sepal Width",
                 petal_width ~ "Petal Width",
                 species ~ "Species"),
    estimate_fun = purrr::partial(style_ratio, digits = 2),
    pvalue_fun = function (x) style_pvalue(x, digits = 3)
  )
Characteristic Beta 95% CI1 p-value
Sepal Width 0.70 0.46, 0.94 <0.001
Petal Width 0.36 -0.03, 0.75 0.073
Species


    Setosa
    Versicolor 1.0 0.45, 1.54 <0.001
    Virginica 1.25 0.48, 2.03 0.002
1 CI = Confidence Interval