Final Exam Optimal Model/Best fit

getwd()

## [1] "C:/Users/antho/OneDrive/Documents/School/4.DataSecurity&Governance"

data<-read.csv("finalexam.csv",header = TRUE,sep = ",")
data

summary(data)

##     region               pop              income          ipaddr       
##  Length:29          Min.   :   2311   Min.   :26784   Min.   :    637  
##  Class :character   1st Qu.:  19005   1st Qu.:37970   1st Qu.:  12294  
##  Mode  :character   Median :  32122   Median :41595   Median :  30418  
##                     Mean   : 135940   Mean   :43858   Mean   : 440130  
##                     3rd Qu.: 101482   3rd Qu.:47469   3rd Qu.: 102104  
##                     Max.   :1554720   Max.   :70821   Max.   :5394949  
##     ufo2010         infections  
##  Min.   :  0.00   Min.   :  39  
##  1st Qu.:  0.00   1st Qu.: 123  
##  Median :  2.00   Median : 245  
##  Mean   : 16.66   Mean   :1117  
##  3rd Qu.:  9.00   3rd Qu.: 672  
##  Max.   :169.00   Max.   :6781

str(data)

## 'data.frame':    29 obs. of  6 variables:
##  $ region    : chr  "East" "East" "East" "West" ...
##  $ pop       : int  25101 61912 33341 409061 7481 18675 25581 22286 459598 3915 ...
##  $ income    : int  34670 37970 41595 55304 47623 31775 33157 31038 56089 36845 ...
##  $ ipaddr    : int  30330 38203 41338 1035427 3762 14303 75347 4543 3206226 1916 ...
##  $ ufo2010   : int  2 6 2 59 0 1 1 0 115 0 ...
##  $ infections: int  245 215 2076 5023 189 195 123 116 3298 430 ...

head(data)

model1<-lm(infections~income+pop,data = data)
summary(model1)

## 
## Call:
## lm(formula = infections ~ income + pop, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1676.4  -809.5  -249.5    83.6  6087.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -1.888e+03  1.649e+03  -1.145   0.2626  
## income       6.161e-02  3.969e-02   1.552   0.1327  
## pop          2.227e-03  1.293e-03   1.723   0.0968 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1489 on 26 degrees of freedom
## Multiple R-squared:  0.3955, Adjusted R-squared:  0.349 
## F-statistic: 8.504 on 2 and 26 DF,  p-value: 0.00144

#install.packages("car")

library(car)

## Warning: package 'car' was built under R version 4.2.1

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.2.1

vif(model1)

##   income      pop 
## 1.881562 1.881562

confint(model1)

##                     2.5 %       97.5 %
## (Intercept) -5.277208e+03 1.501182e+03
## income      -1.996819e-02 1.431971e-01
## pop         -4.300653e-04 4.884768e-03

data1<-data[,c(2,3,4,5,6)]

cor.data<-cor(data1)
cor.data

##                  pop    income    ipaddr   ufo2010 infections
## pop        1.0000000 0.6844901 0.9511769 0.9418469  0.5826141
## income     0.6844901 1.0000000 0.6928019 0.6840534  0.5713655
## ipaddr     0.9511769 0.6928019 1.0000000 0.9829151  0.5696741
## ufo2010    0.9418469 0.6840534 0.9829151 1.0000000  0.6005828
## infections 0.5826141 0.5713655 0.5696741 0.6005828  1.0000000

#install.packages("Corrplot")

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.2.1

## corrplot 0.92 loaded

#install.packages("leaps")

library("leaps")

## Warning: package 'leaps' was built under R version 4.2.1

model2<-lm(infections~.,data = data1)
summary(model2)

## 
## Call:
## lm(formula = infections ~ ., data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1575.4  -735.6  -332.8   178.0  6131.8 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.976e+03  1.696e+03  -1.165    0.255
## pop          1.612e-03  3.110e-03   0.518    0.609
## income       6.079e-02  4.057e-02   1.499    0.147
## ipaddr      -1.585e-03  1.455e-03  -1.089    0.287
## ufo2010      5.438e+01  4.080e+01   1.333    0.195
## 
## Residual standard error: 1496 on 24 degrees of freedom
## Multiple R-squared:  0.4372, Adjusted R-squared:  0.3434 
## F-statistic:  4.66 on 4 and 24 DF,  p-value: 0.006312

vif(model2)

##       pop    income    ipaddr   ufo2010 
## 10.792430  1.948759 35.836092 29.965153

submodel2<-regsubsets(infections~.,data = data1)
best.summary<-summary(submodel2)

names(best.summary)

## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"

which.min(best.summary$rss)

## [1] 4

par(mfrow=c(1,2))

plot(best.summary$cp,xlab = "number of features",ylab = "cp")

which.min(best.summary$bic)

## [1] 1

which.max(best.summary$adjr2)

## [1] 2

bestsummary.fit1<-lm(infections~income+ufo2010,data = data1)

summary(bestsummary.fit1)

## 
## Call:
## lm(formula = infections ~ income + ufo2010, data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1686.3  -715.9  -360.5    70.1  6096.2 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -1.713e+03  1.633e+03  -1.049   0.3037  
## income       5.725e-02  3.922e-02   1.460   0.1563  
## ufo2010      1.919e+01  1.006e+01   1.907   0.0676 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1472 on 26 degrees of freedom
## Multiple R-squared:  0.4091, Adjusted R-squared:  0.3637 
## F-statistic: 9.002 on 2 and 26 DF,  p-value: 0.00107

Make a decision that you think can help us get the best results out of this model.

par(mfrow=c(2,2))
plot(bestsummary.fit1)

library(car)
vif(bestsummary.fit1)

##   income  ufo2010 
## 1.879448 1.879448

bestsummary.fit2<-lm(infections~ufo2010,data = data1)
summary(bestsummary.fit2)

## 
## Call:
## lm(formula = infections ~ ufo2010, data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1187.9  -614.3  -514.2  -200.2  6092.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   630.20     305.67   2.062 0.048982 *  
## ufo2010        29.23       7.49   3.903 0.000572 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1503 on 27 degrees of freedom
## Multiple R-squared:  0.3607, Adjusted R-squared:  0.337 
## F-statistic: 15.23 on 1 and 27 DF,  p-value: 0.0005717

bestsummary.fit3<-lm(infections~income,data = data1)
summary(bestsummary.fit3)

## 
## Call:
## lm(formula = infections ~ income, data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1947.4  -853.3  -253.3   388.0  5950.1 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -3.638e+03  1.345e+03  -2.704  0.01170 * 
## income       1.084e-01  2.997e-02   3.618  0.00121 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1543 on 27 degrees of freedom
## Multiple R-squared:  0.3265, Adjusted R-squared:  0.3015 
## F-statistic: 13.09 on 1 and 27 DF,  p-value: 0.001206

Let us run the Breuch-Pagan (BP)test in order to address the heteroskedasticity issue:

\(H_0\):\(sigma^2=0\) \(H_a\):\(Otherwise\)

#install.packages("lmtest")

library(lmtest)

## Warning: package 'lmtest' was built under R version 4.2.1

## Loading required package: zoo

## Warning: package 'zoo' was built under R version 4.2.1

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

bptest(bestsummary.fit2)

## 
##  studentized Breusch-Pagan test
## 
## data:  bestsummary.fit2
## BP = 0.055809, df = 1, p-value = 0.8132

P-value =0.8132 so we do not have enough evidence to reject the null(error variance=0)

plot(bestsummary.fit2$fitted.values,data1$infections,xlab = "predicted",ylab = "actual",main = "Actual vs Predicted")

data1["Actual"]=data$ufo2010
#create the vector actual
data1$Forecast=predict(bestsummary.fit2)
#populate forecast with the predicted values

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.2.1

ggplot(data1,aes(x=Forecast,y=Actual))+geom_point()

geom_smooth(method = lm)

## geom_smooth: na.rm = FALSE, orientation = NA, se = TRUE
## stat_smooth: na.rm = FALSE, orientation = NA, se = TRUE, method = function (formula, data, subset, weights, na.action, method = "qr", model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, contrasts = NULL, offset, ...) 
## {
##     ret.x <- x
##     ret.y <- y
##     cl <- match.call()
##     mf <- match.call(expand.dots = FALSE)
##     m <- match(c("formula", "data", "subset", "weights", "na.action", "offset"), names(mf), 0)
##     mf <- mf[c(1, m)]
##     mf$drop.unused.levels <- TRUE
##     mf[[1]] <- quote(stats::model.frame)
##     mf <- eval(mf, parent.frame())
##     if (method == "model.frame") 
##         return(mf)
##     else if (method != "qr") 
##         warning(gettextf("method = '%s' is not supported. Using 'qr'", method), domain = NA)
##     mt <- attr(mf, "terms")
##     y <- model.response(mf, "numeric")
##     w <- as.vector(model.weights(mf))
##     if (!is.null(w) && !is.numeric(w)) 
##         stop("'weights' must be a numeric vector")
##     offset <- model.offset(mf)
##     mlm <- is.matrix(y)
##     ny <- if (mlm) 
##         nrow(y)
##     else length(y)
##     if (!is.null(offset)) {
##         if (!mlm) 
##             offset <- as.vector(offset)
##         if (NROW(offset) != ny) 
##             stop(gettextf("number of offsets is %d, should equal %d (number of observations)", NROW(offset), ny), domain = NA)
##     }
##     if (is.empty.model(mt)) {
##         x <- NULL
##         z <- list(coefficients = if (mlm) matrix(NA, 0, ncol(y)) else numeric(), residuals = y, fitted.values = 0 * y, weights = w, rank = 0, df.residual = if (!is.null(w)) sum(w != 0) else ny)
##         if (!is.null(offset)) {
##             z$fitted.values <- offset
##             z$residuals <- y - offset
##         }
##     }
##     else {
##         x <- model.matrix(mt, mf, contrasts)
##         z <- if (is.null(w)) 
##             lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...)
##         else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok, ...)
##     }
##     class(z) <- c(if (mlm) "mlm", "lm")
##     z$na.action <- attr(mf, "na.action")
##     z$offset <- offset
##     z$contrasts <- attr(x, "contrasts")
##     z$xlevels <- .getXlevels(mt, mf)
##     z$call <- cl
##     z$terms <- mt
##     if (model) 
##         z$model <- mf
##     if (ret.x) 
##         z$x <- x
##     if (ret.y) 
##         z$y <- y
##     if (!qr) 
##         z$qr <- NULL
##     z
## }
## position_identity

I would like you to add a title to the plot obtained above.

We need this package in order to calculate the Prediction Error Sum of Squares.

#install.packages("MPV")

library(MPV)

## Warning: package 'MPV' was built under R version 4.2.1

## Loading required package: lattice

## Loading required package: KernSmooth

## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009

Let us examine the prediction sum of squares(Press) statistic and selecting the model that has the lowest value.

PRESS(model2)#every quantitative predictor

## [1] 76844251

PRESS(bestsummary.fit1)#income+ufo2010

## [1] 63477705

PRESS(bestsummary.fit3)#income

## [1] 73858528

PRESS(bestsummary.fit2)#ufo

## [1] 67273112

#Other linear considerations Let us discuss first the inclusion of a quantitative variable and second the inclusion of an interaction term.

finaldata<-read.csv("finalexam.csv",header = TRUE,sep = ",")
finaldata

#install.packages("ISLR")

library(ISLR)

## Warning: package 'ISLR' was built under R version 4.2.1

str(data)

## 'data.frame':    29 obs. of  6 variables:
##  $ region    : chr  "East" "East" "East" "West" ...
##  $ pop       : int  25101 61912 33341 409061 7481 18675 25581 22286 459598 3915 ...
##  $ income    : int  34670 37970 41595 55304 47623 31775 33157 31038 56089 36845 ...
##  $ ipaddr    : int  30330 38203 41338 1035427 3762 14303 75347 4543 3206226 1916 ...
##  $ ufo2010   : int  2 6 2 59 0 1 1 0 115 0 ...
##  $ infections: int  245 215 2076 5023 189 195 123 116 3298 430 ...

qualitative.fit<-lm(infections~region+ufo2010,data = finaldata)
summary(qualitative.fit)

## 
## Call:
## lm(formula = infections ~ region + ufo2010, data = finaldata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2263.7  -587.4   -82.4    87.4  4559.1 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  134.397    417.051   0.322   0.7499  
## regionEast   510.778    594.701   0.859   0.3986  
## regionWest  2052.247    788.274   2.603   0.0153 *
## ufo2010       17.626      8.201   2.149   0.0415 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1383 on 25 degrees of freedom
## Multiple R-squared:  0.4986, Adjusted R-squared:  0.4385 
## F-statistic: 8.288 on 3 and 25 DF,  p-value: 0.0005367

incufo<-finaldata$ufo2010*finaldata$income

interaction.fit<-lm(infections~region+incufo,data = finaldata)
summary(interaction.fit)

## 
## Call:
## lm(formula = infections ~ region + incufo, data = finaldata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2206.5  -578.0   -88.8    93.4  4548.9 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 1.408e+02  4.129e+02   0.341   0.7360  
## regionEast  5.486e+02  5.866e+02   0.935   0.3586  
## regionWest  2.069e+03  7.663e+02   2.700   0.0123 *
## incufo      2.761e-04  1.209e-04   2.283   0.0312 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1369 on 25 degrees of freedom
## Multiple R-squared:  0.5085, Adjusted R-squared:  0.4495 
## F-statistic: 8.621 on 3 and 25 DF,  p-value: 0.0004223

PRESS(qualitative.fit)

## [1] 75603540

PRESS(interaction.fit)

## [1] 71569626