Part 1

# Load (installing if necessary) every package used in this report.
# The order of the list is kept as-is because attach order decides which
# package wins when two of them export the same name.
pacman::p_load(
  dplyr, caret, ModelMetrics, datarium, corrplot, car, datasets, glmnet,
  bestglm
)

# Advertising data set shipped with datarium: youtube, facebook, newspaper
# budgets and the resulting sales.
data("marketing", package = "datarium")

# Inspect the data: draw three random rows.
# slice_sample() replaces sample_n(), which dplyr has superseded.
slice_sample(marketing, n = 3)

##   youtube facebook newspaper sales
## 1  212.40    11.16      7.68 15.36
## 2   90.12    42.00     63.24 15.12
## 3  260.16    52.68     32.64 26.76

# Distribution of the response: a histogram on the density scale with a
# kernel density estimate drawn on top of it.
p <- ggplot(marketing, aes(x = sales)) +
  geom_histogram(
    # after_stat(density) replaces the deprecated `..density..` notation.
    aes(y = after_stat(density)),
    binwidth = 1, fill = "grey", color = "black"
  ) +
  # A constant colour is set outside aes(); inside aes() the string "red" is
  # treated as a data value and mapped through the default colour scale, so
  # the curve would not actually be drawn in red.
  geom_density(color = "red")
p + theme_bw()

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Standardise every column of `marketing` (centre, then scale).
# NOTE(review): the scaling parameters are estimated on the full data set,
# response included, before the train/test split further down, so the held-out
# rows influence them -- confirm this is intended.
preproc1 <- preProcess(marketing, method = c("center", "scale"))
norm1 <- predict(preproc1, marketing)

# Alternative "range" rescaling of the same columns; `norm2` is not referenced
# by any of the code visible in this report.
preproc2 <- preProcess(marketing, method = "range")
norm2 <- predict(preproc2, marketing)

# Pairwise correlations of the standardised columns, together with the
# matching matrix of significance-test p-values.
M <- cor(norm1)
p.mat <- cor.mtest(norm1)
# print(p.mat)

# Upper triangle only, variables ordered by hierarchical clustering;
# `sig.level` sets the 5% threshold applied to the supplied p-values.
corrplot(
  M,
  type = "upper",
  order = "hclust",
  p.mat = p.mat$p,
  sig.level = 0.05
)

# Fix the RNG seed so the partition below is reproducible.
set.seed(123)
training.samples <- createDataPartition(
  y = norm1$sales,
  p = 0.8,
  list = FALSE
)

# Split the standardised data using the sampled row indices.
train.data <- norm1[training.samples, ]  # 162 rows (80%)
test.data <- norm1[-training.samples, ]  # 38 rows (20%)

# Build the model: linear regression of sales on all remaining columns.
model <- lm(sales ~ ., data = train.data)

# Make predictions for the held-out rows.
predictions <- predict(model, newdata = test.data)

# Model performance on the test set. `sales` was standardised together with
# the predictors, so the error metrics are on the standardised scale, not in
# the original sales units.
data.frame(
  RMSE = RMSE(predictions, test.data$sales),
  R2 = R2(predictions, test.data$sales),
  MAE = MAE(predictions, test.data$sales),
  MSE = mse(predictions, test.data$sales)
)

##        RMSE        R2       MAE        MSE
## 1 0.3139314 0.9049049 0.2289764 0.09855291

# Variance inflation factors of the fitted model's predictors, used here as a
# multicollinearity check.
car::vif(model)

##   youtube  facebook newspaper 
##  1.004440  1.118155  1.115449

Part 2

## 
## Call:
## lm(formula = Fertility ~ ., data = swiss)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2743  -5.2617   0.5032   4.1198  15.3213 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      66.91518   10.70604   6.250 1.91e-07 ***
## Agriculture      -0.17211    0.07030  -2.448  0.01873 *  
## Examination      -0.25801    0.25388  -1.016  0.31546    
## Education        -0.87094    0.18303  -4.758 2.43e-05 ***
## Catholic          0.10412    0.03526   2.953  0.00519 ** 
## Infant.Mortality  1.07705    0.38172   2.822  0.00734 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.165 on 41 degrees of freedom
## Multiple R-squared:  0.7067, Adjusted R-squared:  0.671 
## F-statistic: 19.76 on 5 and 41 DF,  p-value: 5.594e-10

## 
## Call:
## glm(formula = chd ~ ldl, family = binomial, data = SAheart)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.96867    0.27308  -7.209 5.63e-13 ***
## ldl          0.27466    0.05164   5.319 1.04e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 596.11  on 461  degrees of freedom
## Residual deviance: 564.28  on 460  degrees of freedom
## AIC: 568.28
## 
## Number of Fisher Scoring iterations: 4

## 6 x 1 sparse Matrix of class "dgCMatrix"
##                           s1
## (Intercept)      62.97585936
## Agriculture      -0.09863022
## Examination      -0.33967990
## Education        -0.64733678
## Catholic          0.07703325
## Infant.Mortality  1.08821833

## 6 x 1 sparse Matrix of class "dgCMatrix"
##                           s1
## (Intercept)      65.46374579
## Agriculture      -0.14994107
## Examination      -0.24310141
## Education        -0.83632674
## Catholic          0.09913931
## Infant.Mortality  1.07238898

Applied Data Science Coding Practice Week 4

Alexander Watkins

2024-09-18

Part 1

Part 2