getwd()
## [1] "C:/Users/antho/OneDrive/Documents/School/4.DataSecurity&Governance"
data<-read.csv("finalexam.csv",header = TRUE,sep = ",")
data
summary(data)
## region pop income ipaddr
## Length:29 Min. : 2311 Min. :26784 Min. : 637
## Class :character 1st Qu.: 19005 1st Qu.:37970 1st Qu.: 12294
## Mode :character Median : 32122 Median :41595 Median : 30418
## Mean : 135940 Mean :43858 Mean : 440130
## 3rd Qu.: 101482 3rd Qu.:47469 3rd Qu.: 102104
## Max. :1554720 Max. :70821 Max. :5394949
## ufo2010 infections
## Min. : 0.00 Min. : 39
## 1st Qu.: 0.00 1st Qu.: 123
## Median : 2.00 Median : 245
## Mean : 16.66 Mean :1117
## 3rd Qu.: 9.00 3rd Qu.: 672
## Max. :169.00 Max. :6781
str(data)
## 'data.frame': 29 obs. of 6 variables:
## $ region : chr "East" "East" "East" "West" ...
## $ pop : int 25101 61912 33341 409061 7481 18675 25581 22286 459598 3915 ...
## $ income : int 34670 37970 41595 55304 47623 31775 33157 31038 56089 36845 ...
## $ ipaddr : int 30330 38203 41338 1035427 3762 14303 75347 4543 3206226 1916 ...
## $ ufo2010 : int 2 6 2 59 0 1 1 0 115 0 ...
## $ infections: int 245 215 2076 5023 189 195 123 116 3298 430 ...
head(data)
model1<-lm(infections~income+pop,data = data)
summary(model1)
##
## Call:
## lm(formula = infections ~ income + pop, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1676.4 -809.5 -249.5 83.6 6087.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.888e+03 1.649e+03 -1.145 0.2626
## income 6.161e-02 3.969e-02 1.552 0.1327
## pop 2.227e-03 1.293e-03 1.723 0.0968 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1489 on 26 degrees of freedom
## Multiple R-squared: 0.3955, Adjusted R-squared: 0.349
## F-statistic: 8.504 on 2 and 26 DF, p-value: 0.00144
#install.packages("car")
library(car)
## Warning: package 'car' was built under R version 4.2.1
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.2.1
vif(model1)
## income pop
## 1.881562 1.881562
confint(model1)
## 2.5 % 97.5 %
## (Intercept) -5.277208e+03 1.501182e+03
## income -1.996819e-02 1.431971e-01
## pop -4.300653e-04 4.884768e-03
data1<-data[,c(2,3,4,5,6)]
cor.data<-cor(data1)
cor.data
## pop income ipaddr ufo2010 infections
## pop 1.0000000 0.6844901 0.9511769 0.9418469 0.5826141
## income 0.6844901 1.0000000 0.6928019 0.6840534 0.5713655
## ipaddr 0.9511769 0.6928019 1.0000000 0.9829151 0.5696741
## ufo2010 0.9418469 0.6840534 0.9829151 1.0000000 0.6005828
## infections 0.5826141 0.5713655 0.5696741 0.6005828 1.0000000
#install.packages("Corrplot")
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.2.1
## corrplot 0.92 loaded
#install.packages("leaps")
library("leaps")
## Warning: package 'leaps' was built under R version 4.2.1
model2<-lm(infections~.,data = data1)
summary(model2)
##
## Call:
## lm(formula = infections ~ ., data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1575.4 -735.6 -332.8 178.0 6131.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.976e+03 1.696e+03 -1.165 0.255
## pop 1.612e-03 3.110e-03 0.518 0.609
## income 6.079e-02 4.057e-02 1.499 0.147
## ipaddr -1.585e-03 1.455e-03 -1.089 0.287
## ufo2010 5.438e+01 4.080e+01 1.333 0.195
##
## Residual standard error: 1496 on 24 degrees of freedom
## Multiple R-squared: 0.4372, Adjusted R-squared: 0.3434
## F-statistic: 4.66 on 4 and 24 DF, p-value: 0.006312
vif(model2)
## pop income ipaddr ufo2010
## 10.792430 1.948759 35.836092 29.965153
submodel2<-regsubsets(infections~.,data = data1)
best.summary<-summary(submodel2)
names(best.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
which.min(best.summary$rss)
## [1] 4
par(mfrow=c(1,2))
plot(best.summary$cp,xlab = "number of features",ylab = "cp")
which.min(best.summary$bic)
## [1] 1
which.max(best.summary$adjr2)
## [1] 2
bestsummary.fit1<-lm(infections~income+ufo2010,data = data1)
summary(bestsummary.fit1)
##
## Call:
## lm(formula = infections ~ income + ufo2010, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1686.3 -715.9 -360.5 70.1 6096.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.713e+03 1.633e+03 -1.049 0.3037
## income 5.725e-02 3.922e-02 1.460 0.1563
## ufo2010 1.919e+01 1.006e+01 1.907 0.0676 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1472 on 26 degrees of freedom
## Multiple R-squared: 0.4091, Adjusted R-squared: 0.3637
## F-statistic: 9.002 on 2 and 26 DF, p-value: 0.00107
Make a decision that you think can help us get the best results out of this model.
par(mfrow=c(2,2))
plot(bestsummary.fit1)
library(car)
vif(bestsummary.fit1)
## income ufo2010
## 1.879448 1.879448
bestsummary.fit2<-lm(infections~ufo2010,data = data1)
summary(bestsummary.fit2)
##
## Call:
## lm(formula = infections ~ ufo2010, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1187.9 -614.3 -514.2 -200.2 6092.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 630.20 305.67 2.062 0.048982 *
## ufo2010 29.23 7.49 3.903 0.000572 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1503 on 27 degrees of freedom
## Multiple R-squared: 0.3607, Adjusted R-squared: 0.337
## F-statistic: 15.23 on 1 and 27 DF, p-value: 0.0005717
bestsummary.fit3<-lm(infections~income,data = data1)
summary(bestsummary.fit3)
##
## Call:
## lm(formula = infections ~ income, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1947.4 -853.3 -253.3 388.0 5950.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.638e+03 1.345e+03 -2.704 0.01170 *
## income 1.084e-01 2.997e-02 3.618 0.00121 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1543 on 27 degrees of freedom
## Multiple R-squared: 0.3265, Adjusted R-squared: 0.3015
## F-statistic: 13.09 on 1 and 27 DF, p-value: 0.001206
Let us run the Breuch-Pagan (BP)test in order to address the heteroskedasticity issue:
\(H_0\):\(sigma^2=0\) \(H_a\):\(Otherwise\)
#install.packages("lmtest")
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.2.1
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.2.1
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(bestsummary.fit2)
##
## studentized Breusch-Pagan test
##
## data: bestsummary.fit2
## BP = 0.055809, df = 1, p-value = 0.8132
P-value =0.8132 so we do not have enough evidence to reject the null(error variance=0)
plot(bestsummary.fit2$fitted.values,data1$infections,xlab = "predicted",ylab = "actual",main = "Actual vs Predicted")
data1["Actual"]=data$ufo2010
#create the vector actual
data1$Forecast=predict(bestsummary.fit2)
#populate forecast with the predicted values
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.1
ggplot(data1,aes(x=Forecast,y=Actual))+geom_point()
geom_smooth(method = lm)
## geom_smooth: na.rm = FALSE, orientation = NA, se = TRUE
## stat_smooth: na.rm = FALSE, orientation = NA, se = TRUE, method = function (formula, data, subset, weights, na.action, method = "qr", model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, contrasts = NULL, offset, ...)
## {
## ret.x <- x
## ret.y <- y
## cl <- match.call()
## mf <- match.call(expand.dots = FALSE)
## m <- match(c("formula", "data", "subset", "weights", "na.action", "offset"), names(mf), 0)
## mf <- mf[c(1, m)]
## mf$drop.unused.levels <- TRUE
## mf[[1]] <- quote(stats::model.frame)
## mf <- eval(mf, parent.frame())
## if (method == "model.frame")
## return(mf)
## else if (method != "qr")
## warning(gettextf("method = '%s' is not supported. Using 'qr'", method), domain = NA)
## mt <- attr(mf, "terms")
## y <- model.response(mf, "numeric")
## w <- as.vector(model.weights(mf))
## if (!is.null(w) && !is.numeric(w))
## stop("'weights' must be a numeric vector")
## offset <- model.offset(mf)
## mlm <- is.matrix(y)
## ny <- if (mlm)
## nrow(y)
## else length(y)
## if (!is.null(offset)) {
## if (!mlm)
## offset <- as.vector(offset)
## if (NROW(offset) != ny)
## stop(gettextf("number of offsets is %d, should equal %d (number of observations)", NROW(offset), ny), domain = NA)
## }
## if (is.empty.model(mt)) {
## x <- NULL
## z <- list(coefficients = if (mlm) matrix(NA, 0, ncol(y)) else numeric(), residuals = y, fitted.values = 0 * y, weights = w, rank = 0, df.residual = if (!is.null(w)) sum(w != 0) else ny)
## if (!is.null(offset)) {
## z$fitted.values <- offset
## z$residuals <- y - offset
## }
## }
## else {
## x <- model.matrix(mt, mf, contrasts)
## z <- if (is.null(w))
## lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...)
## else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok, ...)
## }
## class(z) <- c(if (mlm) "mlm", "lm")
## z$na.action <- attr(mf, "na.action")
## z$offset <- offset
## z$contrasts <- attr(x, "contrasts")
## z$xlevels <- .getXlevels(mt, mf)
## z$call <- cl
## z$terms <- mt
## if (model)
## z$model <- mf
## if (ret.x)
## z$x <- x
## if (ret.y)
## z$y <- y
## if (!qr)
## z$qr <- NULL
## z
## }
## position_identity
I would like you to add a title to the plot obtained above.
We need this package in order to calculate the Prediction Error Sum of Squares.
#install.packages("MPV")
library(MPV)
## Warning: package 'MPV' was built under R version 4.2.1
## Loading required package: lattice
## Loading required package: KernSmooth
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
Let us examine the prediction sum of squares(Press) statistic and selecting the model that has the lowest value.
PRESS(model2)#every quantitative predictor
## [1] 76844251
PRESS(bestsummary.fit1)#income+ufo2010
## [1] 63477705
PRESS(bestsummary.fit3)#income
## [1] 73858528
PRESS(bestsummary.fit2)#ufo
## [1] 67273112
#Other linear considerations Let us discuss first the inclusion of a quantitative variable and second the inclusion of an interaction term.
finaldata<-read.csv("finalexam.csv",header = TRUE,sep = ",")
finaldata
#install.packages("ISLR")
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.2.1
str(data)
## 'data.frame': 29 obs. of 6 variables:
## $ region : chr "East" "East" "East" "West" ...
## $ pop : int 25101 61912 33341 409061 7481 18675 25581 22286 459598 3915 ...
## $ income : int 34670 37970 41595 55304 47623 31775 33157 31038 56089 36845 ...
## $ ipaddr : int 30330 38203 41338 1035427 3762 14303 75347 4543 3206226 1916 ...
## $ ufo2010 : int 2 6 2 59 0 1 1 0 115 0 ...
## $ infections: int 245 215 2076 5023 189 195 123 116 3298 430 ...
qualitative.fit<-lm(infections~region+ufo2010,data = finaldata)
summary(qualitative.fit)
##
## Call:
## lm(formula = infections ~ region + ufo2010, data = finaldata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2263.7 -587.4 -82.4 87.4 4559.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 134.397 417.051 0.322 0.7499
## regionEast 510.778 594.701 0.859 0.3986
## regionWest 2052.247 788.274 2.603 0.0153 *
## ufo2010 17.626 8.201 2.149 0.0415 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1383 on 25 degrees of freedom
## Multiple R-squared: 0.4986, Adjusted R-squared: 0.4385
## F-statistic: 8.288 on 3 and 25 DF, p-value: 0.0005367
incufo<-finaldata$ufo2010*finaldata$income
interaction.fit<-lm(infections~region+incufo,data = finaldata)
summary(interaction.fit)
##
## Call:
## lm(formula = infections ~ region + incufo, data = finaldata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2206.5 -578.0 -88.8 93.4 4548.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.408e+02 4.129e+02 0.341 0.7360
## regionEast 5.486e+02 5.866e+02 0.935 0.3586
## regionWest 2.069e+03 7.663e+02 2.700 0.0123 *
## incufo 2.761e-04 1.209e-04 2.283 0.0312 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1369 on 25 degrees of freedom
## Multiple R-squared: 0.5085, Adjusted R-squared: 0.4495
## F-statistic: 8.621 on 3 and 25 DF, p-value: 0.0004223
PRESS(qualitative.fit)
## [1] 75603540
PRESS(interaction.fit)
## [1] 71569626