# Load the data
data("marketing", package = "datarium")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ModelMetrics)
## 
## Attaching package: 'ModelMetrics'
## The following objects are masked from 'package:caret':
## 
##     confusionMatrix, precision, recall, sensitivity, specificity
## The following object is masked from 'package:base':
## 
##     kappa
# Inspect the data
sample_n(marketing, 3) # Sample n rows from a table
##   youtube facebook newspaper sales
## 1  264.60    39.84     45.48 24.12
## 2  336.84    16.68     44.40 19.32
## 3  332.28    58.68     50.16 32.40
# Sales Distribution
p <- ggplot(marketing) +
    geom_histogram(aes(x = sales, y = after_stat(density)),
                   binwidth = 1, fill = "grey", color = "black") +
    geom_density(aes(x = sales), color = "red")
p + theme_bw()

preproc1 <- preProcess(marketing, method = c("center", "scale"))
norm1 <- predict(preproc1, marketing)
head(norm1)
##       youtube   facebook newspaper      sales
## 1  0.96742460  0.9790656 1.7744925  1.5481681
## 2 -1.19437904  1.0800974 0.6679027 -0.6943038
## 3 -1.51235985  1.5246374 1.7790842 -0.9051345
## 4  0.05191939  1.2148065 1.2831850  0.8581768
## 5  0.39319551 -0.8395070 1.2785934 -0.2151431
## 6 -1.61136487  1.7267010 2.0408088 -1.3076295
preproc2 <- preProcess(marketing, method = c("range"))
norm2 <- predict(preproc2, marketing)
head(norm2)
##      youtube  facebook newspaper     sales
## 1 0.77578627 0.7620968 0.6059807 0.8070866
## 2 0.14812310 0.7923387 0.3940193 0.3464567
## 3 0.05579980 0.9254032 0.6068602 0.3031496
## 4 0.50997633 0.8326613 0.5118734 0.6653543
## 5 0.60906324 0.2177419 0.5109938 0.4448819
## 6 0.02705445 0.9858871 0.6569921 0.2204724
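
Both transformations can be reproduced by hand as a sanity check. The sketch below assumes the norm1 and norm2 objects created above and checks only the youtube column: "center"/"scale" should match (x - mean) / sd, and "range" should match (x - min) / (max - min).

manual_scaled <- (marketing$youtube - mean(marketing$youtube)) / sd(marketing$youtube)
manual_range <- (marketing$youtube - min(marketing$youtube)) /
  (max(marketing$youtube) - min(marketing$youtube))
all.equal(manual_scaled, norm1$youtube) # expected TRUE
all.equal(manual_range, norm2$youtube)  # expected TRUE
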
library(corrplot)
## corrplot 0.95 loaded
M <- cor(norm1)
p.mat <- cor.mtest(norm1)
# print(p.mat)
corrplot(M, type = "upper", order = "hclust",
         p.mat = p.mat$p, sig.level = 0.05)

set.seed(123)
training.samples <- createDataPartition(y = norm1$sales, p = 0.8, list = FALSE)
train.data <- norm1[training.samples,] #162 (80%)
test.data <- norm1[-training.samples, ] #38 (20%)
#Build the model
model <- lm(sales ~., data = train.data)
# Make predictions
predictions <- predict(model, test.data)
#Model performance
data.frame(RMSE = RMSE(predictions, test.data$sales),
           R2 = R2(predictions, test.data$sales),
           MAE = MAE(predictions, test.data$sales),
           MSE = mse(predictions, test.data$sales))
##        RMSE        R2       MAE        MSE
## 1 0.3139314 0.9049049 0.2289764 0.09855291
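
The hold-out figures above depend on this particular 80/20 split. As a cross-check, caret's train() can estimate the same kind of metrics by k-fold cross-validation; a minimal sketch, with an arbitrary choice of 10 folds:

set.seed(123)
cv_model <- train(sales ~ ., data = norm1, method = "lm",
                  trControl = trainControl(method = "cv", number = 10))
cv_model$results # cross-validated RMSE, Rsquared and MAE
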
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
vif(model)
##   youtube  facebook newspaper 
##  1.004440  1.118155  1.115449
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1
## ✔ readr     2.1.5     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ✖ car::recode()   masks dplyr::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(modelr)
## 
## Attaching package: 'modelr'
## 
## The following objects are masked from 'package:ModelMetrics':
## 
##     mae, mse, rmse
library(broom)
## 
## Attaching package: 'broom'
## 
## The following object is masked from 'package:modelr':
## 
##     bootstrap
# Load the data
data("swiss")
sample_n(swiss, 3)
##              Fertility Agriculture Examination Education Catholic
## Veveyse           87.1        64.5          14         6    98.61
## Lavaux            65.1        73.0          19         9     2.84
## La Chauxdfnd      65.7         7.7          29        11    13.79
##              Infant.Mortality
## Veveyse                  24.5
## Lavaux                   20.0
## La Chauxdfnd             20.5
model1 <- lm(Fertility ~., data = swiss)
model2 <- lm(Fertility ~ . - Examination, data = swiss)
summary(model1)
## 
## Call:
## lm(formula = Fertility ~ ., data = swiss)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2743  -5.2617   0.5032   4.1198  15.3213 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      66.91518   10.70604   6.250 1.91e-07 ***
## Agriculture      -0.17211    0.07030  -2.448  0.01873 *  
## Examination      -0.25801    0.25388  -1.016  0.31546    
## Education        -0.87094    0.18303  -4.758 2.43e-05 ***
## Catholic          0.10412    0.03526   2.953  0.00519 ** 
## Infant.Mortality  1.07705    0.38172   2.822  0.00734 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.165 on 41 degrees of freedom
## Multiple R-squared:  0.7067, Adjusted R-squared:  0.671 
## F-statistic: 19.76 on 5 and 41 DF,  p-value: 5.594e-10
AIC(model1)
## [1] 326.0716
BIC(model1)
## [1] 339.0226
library(modelr)
data.frame(
  R2 = rsquare(model1, data = swiss),
  RMSE = rmse(model1, data = swiss),
  MAE = mae(model1, data = swiss)
)
##         R2     RMSE     MAE
## 1 0.706735 6.692395 5.32138
library(caret)
predictions <- model1 %>% predict(swiss)
data.frame(
  R2 = R2(predictions, swiss$Fertility),
  RMSE = RMSE(predictions, swiss$Fertility),
  MAE = MAE(predictions, swiss$Fertility)
)
##         R2     RMSE     MAE
## 1 0.706735 6.692395 5.32138
library(broom)
glance(model1)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.707         0.671  7.17      19.8 5.59e-10     5  -156.  326.  339.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
swiss %>%
  add_predictions(model1) %>%
  summarise(
    R2 = cor(Fertility, pred)^2,
    MSE = mean((Fertility - pred)^2),
    RMSE = sqrt(MSE),
    MAE = mean(abs(Fertility - pred))
  )
##         R2      MSE     RMSE     MAE
## 1 0.706735 44.78815 6.692395 5.32138
# Metrics for model 1
glance(model1) %>%
  dplyr::select(adj.r.squared, sigma, AIC, BIC, p.value)
## # A tibble: 1 × 5
##   adj.r.squared sigma   AIC   BIC  p.value
##           <dbl> <dbl> <dbl> <dbl>    <dbl>
## 1         0.671  7.17  326.  339. 5.59e-10
# Metrics for Model 2
glance(model2) %>%
  dplyr::select(adj.r.squared, sigma, AIC, BIC, p.value)
## # A tibble: 1 × 5
##   adj.r.squared sigma   AIC   BIC    p.value
##           <dbl> <dbl> <dbl> <dbl>      <dbl>
## 1         0.763  6.89  175.  183. 0.00000249
# Prediction error rate: residual standard error relative to the mean outcome
sigma(model1)/mean(swiss$Fertility)
## [1] 0.1021544

1. Linear Regression

Use the lm() function from the base stats package and the swiss data from the datasets library.

library(datasets)
model_swiss = lm(Fertility ~., data = swiss)
lm_coeff = model_swiss$coefficients
print(summary(model_swiss))
## 
## Call:
## lm(formula = Fertility ~ ., data = swiss)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2743  -5.2617   0.5032   4.1198  15.3213 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      66.91518   10.70604   6.250 1.91e-07 ***
## Agriculture      -0.17211    0.07030  -2.448  0.01873 *  
## Examination      -0.25801    0.25388  -1.016  0.31546    
## Education        -0.87094    0.18303  -4.758 2.43e-05 ***
## Catholic          0.10412    0.03526   2.953  0.00519 ** 
## Infant.Mortality  1.07705    0.38172   2.822  0.00734 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.165 on 41 degrees of freedom
## Multiple R-squared:  0.7067, Adjusted R-squared:  0.671 
## F-statistic: 19.76 on 5 and 41 DF,  p-value: 5.594e-10

Note: about 71% of the variation in the fertility rate (multiple R-squared = 0.7067) is explained by this linear regression model.
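
The quoted figure can also be pulled directly from the fitted object rather than read off the printed summary; a short sketch reusing model_swiss and lm_coeff from above:

summary(model_swiss)$r.squared     # multiple R-squared, about 0.707
summary(model_swiss)$adj.r.squared # adjusted R-squared, about 0.671
lm_coeff                           # coefficient vector stored above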

2. Logistic Regression

Use the glm() function with family = "binomial". Install the bestglm library, which provides the SAheart data.

library(bestglm)
## Loading required package: leaps
data("SAheart")
model_cholesterol = glm(chd ~ ldl, data = SAheart, family = binomial)
plot(jitter(chd, factor = 0.1) ~ ldl, data = SAheart, pch = 20,
     ylab = "Probability of CHD", xlab = "Low Density Lipoprotein Cholesterol")
grid()
curve(predict(model_cholesterol, data.frame(ldl = x), type = "response"),
      add = TRUE, col = "dodgerblue", lty = 2)

summary(model_cholesterol)
## 
## Call:
## glm(formula = chd ~ ldl, family = binomial, data = SAheart)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.96867    0.27308  -7.209 5.63e-13 ***
## ldl          0.27466    0.05164   5.319 1.04e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 596.11  on 461  degrees of freedom
## Residual deviance: 564.28  on 460  degrees of freedom
## AIC: 568.28
## 
## Number of Fisher Scoring iterations: 4
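
To read a single point off the fitted curve, predict() with type = "response" returns the modelled probability; the same value can be obtained by plugging the printed coefficients into the inverse logit. The ldl value of 5 below is arbitrary, chosen only for illustration:

predict(model_cholesterol, data.frame(ldl = 5), type = "response")
# Equivalent calculation by hand: p = 1 / (1 + exp(-(b0 + b1 * ldl)))
b <- coef(model_cholesterol)
1 / (1 + exp(-(b[1] + b[2] * 5)))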

3. Ridge Regression

Regularization is generally useful in the following situations:

- A large number of variables
- A low ratio of the number of observations to the number of variables
- High multicollinearity

Use the swiss data and the glmnet library (install it if necessary). Create two objects from swiss, one containing the dependent variable and the other containing the independent variables:

library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-10
x = swiss[,-1] # independent variables (all columns except Fertility)
y = swiss[,1]  # dependent variable (Fertility)
# Setting the seed to get similar results
set.seed(123)
model_ridge = cv.glmnet(as.matrix(x),y,alpha = 0,lambda = 10^seq(4,-1,-0.1))
best_lambda = model_ridge$lambda.min
ridge_coeff = predict(model_ridge, s = best_lambda, type = "coefficients")
ridge_coeff
## 6 x 1 sparse Matrix of class "dgCMatrix"
##                   s=1.584893
## (Intercept)      62.97585936
## Agriculture      -0.09863022
## Examination      -0.33967990
## Education        -0.64733678
## Catholic          0.07703325
## Infant.Mortality  1.08821833
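
The lambda chosen by cross-validation, and the curve it was chosen from, can be inspected directly from the cv.glmnet object; a short sketch:

best_lambda             # lambda minimising the cross-validated error
model_ridge$lambda.1se  # largest lambda within one standard error of the minimum
plot(model_ridge)       # cross-validated MSE against log(lambda)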

4. Lasso Regression

Lasso stands for Least Absolute Shrinkage and Selection Operator.

- Use the same swiss dataset and the same x and y
- Use cv.glmnet() for cross-validation, this time with alpha = 1
- Set standardize = TRUE (this is the default)

# Setting the seed for reproducibility
set.seed(123)
model = cv.glmnet(as.matrix(x), y, alpha = 1, lambda = 10^seq(4,-1,-0.1))
#Taking the best lambda
best_lambda = model$lambda.min
lasso_coeff = predict(model, s = best_lambda, type = "coefficients")
lasso_coeff
## 6 x 1 sparse Matrix of class "dgCMatrix"
##                  s=0.1258925
## (Intercept)      65.46374579
## Agriculture      -0.14994107
## Examination      -0.24310141
## Education        -0.83632674
## Catholic          0.09913931
## Infant.Mortality  1.07238898

Note: both ridge regression and lasso regression are designed to deal with multicollinearity; lasso additionally performs variable selection by shrinking some coefficients exactly to zero.
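
As an informal comparison, the coefficient estimates fitted above can be laid side by side; a short sketch reusing model_swiss, ridge_coeff and lasso_coeff (penalised estimates are generally pulled towards zero relative to OLS):

coef_compare <- data.frame(
  OLS = coef(model_swiss),
  Ridge = as.numeric(as.matrix(ridge_coeff)),
  Lasso = as.numeric(as.matrix(lasso_coeff))
)
round(coef_compare, 4)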