# Load the data
data("marketing", package = "datarium")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ModelMetrics)
##
## Attaching package: 'ModelMetrics'
## The following objects are masked from 'package:caret':
##
## confusionMatrix, precision, recall, sensitivity, specificity
## The following object is masked from 'package:base':
##
## kappa
Inspect the data
# Inspect the data
sample_n(marketing, 3) # Sample n rows from a table
## youtube facebook newspaper sales
## 1 263.76 40.20 54.12 23.52
## 2 275.40 38.76 89.04 23.64
## 3 132.84 48.72 75.84 19.20
Sales Distribution
p <- ggplot(marketing) +
  geom_histogram(aes(x = sales, y = after_stat(density)),
                 binwidth = 1, fill = "grey", color = "black") +
  geom_density(aes(x = sales), color = "red")  # density curve overlaid on the histogram
p + theme_bw()
Standardization centers each variable and scales it by its standard deviation:
\[\begin{align*} z_i = \frac{x_i - \bar{x}}{\sigma} \end{align*}\]
preproc1 <- preProcess(marketing, method=c("center", "scale"))
norm1 <- predict(preproc1, marketing)
head(norm1, 3)
## youtube facebook newspaper sales
## 1 0.9674246 0.9790656 1.7744925 1.5481681
## 2 -1.1943790 1.0800974 0.6679027 -0.6943038
## 3 -1.5123599 1.5246374 1.7790842 -0.9051345
Min-max normalization rescales each variable to the [0, 1] range:
\[\begin{align*} x_{norm} = \frac{x - x_{min}}{x_{max} - x_{min}} \end{align*}\]
preproc2 <- preProcess(marketing, method=c("range"))
norm2 <- predict(preproc2, marketing)
head(norm2, 3)
## youtube facebook newspaper sales
## 1 0.7757863 0.7620968 0.6059807 0.8070866
## 2 0.1481231 0.7923387 0.3940193 0.3464567
## 3 0.0557998 0.9254032 0.6068602 0.3031496
library(corrplot)
## corrplot 0.94 loaded
M <- cor(norm1)            # correlation matrix of the standardized variables
p.mat <- cor.mtest(norm1)  # p-values for each pairwise correlation
# print(p.mat)
corrplot(M, type = "upper", order = "hclust",
         p.mat = p.mat$p, sig.level = 0.05)
set.seed(123)
training.samples <- createDataPartition(y = norm1$sales, p = 0.8, list = FALSE)
train.data <- norm1[training.samples, ]   # 162 rows (80%)
test.data  <- norm1[-training.samples, ]  # 38 rows (20%)
model <- lm(sales ~ ., data = train.data) # regress sales on all other variables
predictions <- predict(model, test.data)
# predictions
The model is evaluated on the held-out test set with the following metrics, defined below:
MSE: Mean Squared Error
MAE: Mean Absolute Error
RMSE: Root Mean Squared Error
R2: R-squared
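For the n test observations, where y_i are the observed sales values, ŷ_i the predictions, and ȳ the mean of the observed values, these metrics are:
\[\begin{align*} MSE &= \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2 \\ RMSE &= \sqrt{MSE} \\ MAE &= \frac{1}{n}\sum_{i=1}^{n}\lvert y_i - \hat{y}_i \rvert \\ R^2 &= 1 - \frac{\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}{\sum_{i=1}^{n}(y_i - \bar{y})^2} \end{align*}\]
(Note that caret's R2() reports the squared correlation between observed and predicted values by default, which is close to the definition above for a reasonable fit.)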
Model Performance:
data.frame( RMSE = RMSE(predictions, test.data$sales),
R2 = R2(predictions, test.data$sales),
MAE = MAE(predictions, test.data$sales),
MSE = mse(predictions, test.data$sales))
## RMSE R2 MAE MSE
## 1 0.3139314 0.9049049 0.2289764 0.09855291
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
vif(model)
## youtube facebook newspaper
## 1.004440 1.118155 1.115449
Use the lm() function from the base stats package with the swiss data from the datasets package.
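The code chunk that produced the summary below is not shown; a minimal call consistent with it (the object name swiss.lm is just illustrative) is:
# Multiple linear regression of Fertility on all remaining swiss variables
swiss.lm <- lm(Fertility ~ ., data = swiss)
summary(swiss.lm)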
##
## Call:
## lm(formula = Fertility ~ ., data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.2743 -5.2617 0.5032 4.1198 15.3213
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.91518 10.70604 6.250 1.91e-07 ***
## Agriculture -0.17211 0.07030 -2.448 0.01873 *
## Examination -0.25801 0.25388 -1.016 0.31546
## Education -0.87094 0.18303 -4.758 2.43e-05 ***
## Catholic 0.10412 0.03526 2.953 0.00519 **
## Infant.Mortality 1.07705 0.38172 2.822 0.00734 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.165 on 41 degrees of freedom
## Multiple R-squared: 0.7067, Adjusted R-squared: 0.671
## F-statistic: 19.76 on 5 and 41 DF, p-value: 5.594e-10
Note: roughly 71% of the variation in Fertility (multiple R-squared = 0.7067) is explained by this linear regression.
Use the glm() function and set family = "binomial" to fit a logistic regression.
Install the bestglm library, which provides the SAheart data set.
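The chunk that produced the output below is not shown; a minimal call consistent with it (the object name chd.glm is illustrative) is:
library(bestglm)   # provides the SAheart data set
data(SAheart)
# Logistic regression of coronary heart disease (chd) on LDL cholesterol (ldl)
chd.glm <- glm(chd ~ ldl, family = binomial, data = SAheart)
summary(chd.glm)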
##
## Call:
## glm(formula = chd ~ ldl, family = binomial, data = SAheart)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.96867 0.27308 -7.209 5.63e-13 ***
## ldl 0.27466 0.05164 5.319 1.04e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 596.11 on 461 degrees of freedom
## Residual deviance: 564.28 on 460 degrees of freedom
## AIC: 568.28
##
## Number of Fisher Scoring iterations: 4
Regularization is generally useful in the following situations: a large number of predictor variables, few observations relative to the number of predictors, and high multicollinearity among the predictors.
Use the swiss data and the glmnet library (install this library). Create two objects from swiss, one containing the dependent variable (Fertility) and the other containing the independent variables:
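The chunk behind the ridge coefficients below is not shown, nor is the lambda value it used; a sketch that picks lambda by cross-validation with cv.glmnet (the object names x, y, cv.ridge, and ridge.model are illustrative) is:
library(glmnet)
y <- swiss$Fertility                                  # dependent variable
x <- model.matrix(Fertility ~ ., data = swiss)[, -1]  # independent variables (drop the intercept column)
set.seed(123)
cv.ridge <- cv.glmnet(x, y, alpha = 0)                # alpha = 0 gives the ridge penalty
ridge.model <- glmnet(x, y, alpha = 0, lambda = cv.ridge$lambda.min)
coef(ridge.model)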
## 6 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 62.97585936
## Agriculture -0.09863022
## Examination -0.33967990
## Education -0.64733678
## Catholic 0.07703325
## Infant.Mortality 1.08821833
Lasso stands for Least Absolute Shrinkage and Selection Operator.
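A matching lasso sketch (alpha = 1), reusing x and y from above and with the same caveat about the lambda value:
set.seed(123)
cv.lasso <- cv.glmnet(x, y, alpha = 1)                # alpha = 1 gives the lasso penalty
lasso.model <- glmnet(x, y, alpha = 1, lambda = cv.lasso$lambda.min)
coef(lasso.model)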
## 6 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 65.46374579
## Agriculture -0.14994107
## Examination -0.24310141
## Education -0.83632674
## Catholic 0.09913931
## Infant.Mortality 1.07238898
Note: both ridge regression and lasso regression are designed to deal with multicollinearity; lasso additionally shrinks some coefficients exactly to zero and therefore also performs variable selection.