# Load Libraries
library(openintro)
library(ggplot2)
library(tidyverse)
# To check what additional datasets are available from the openintro
# package, run the following code:
# data(package = "openintro")
# Loading dataset
data('gpa_study_hours', package = 'openintro')
| Variable    | Description                                   |
|-------------|-----------------------------------------------|
| gpa         | Grade point average (GPA) of the student.     |
| study_hours | Number of hours the student studies per week. |
# glimpse of data
glimpse(gpa_study_hours)
## Rows: 193
## Columns: 2
## $ gpa <dbl> 4.000, 3.800, 3.930, 3.400, 3.200, 3.520, 3.680, 3.400, 3.…
## $ study_hours <dbl> 10, 25, 45, 10, 4, 10, 24, 40, 10, 10, 30, 7, 15, 60, 10, …
# summary of data
summary(gpa_study_hours)
## gpa study_hours
## Min. :2.600 Min. : 2.00
## 1st Qu.:3.400 1st Qu.:10.00
## Median :3.620 Median :15.00
## Mean :3.586 Mean :17.48
## 3rd Qu.:3.800 3rd Qu.:20.00
## Max. :4.300 Max. :69.00
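Before fitting a model, it can help to quantify the linear association directly. A minimal sketch using base R's cor(); since this is a simple regression, its square should match the Multiple R-squared (0.01769) reported by the model below:
# Pearson correlation between GPA and weekly study hours
cor(gpa_study_hours$gpa, gpa_study_hours$study_hours)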
# Scatterplot of study hours against GPA
gpa_study_hours %>%
  ggplot(aes(x = gpa, y = study_hours)) +
  geom_point() +
  xlab("GPA") +
  ylab("Hours Studying") +
  ggtitle("Intro to Stats Class")
# Creating a box plot to visualize potential outliers in the data
# (in the formula gpa ~ study_hours, study hours fall on the x-axis)
boxplot(gpa ~ study_hours, data = gpa_study_hours, main = "Intro to Stats Class",
        xlab = "Study Hours", ylab = "GPA")
# Obtaining our Linear Model
my_lm <- lm(gpa_study_hours$gpa ~ gpa_study_hours$study_hours)
summary(my_lm)
##
## Call:
## lm(formula = gpa_study_hours$gpa ~ gpa_study_hours$study_hours)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.95130 -0.19456 0.03879 0.21708 0.73872
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.527997 0.037424 94.272 <2e-16 ***
## gpa_study_hours$study_hours 0.003328 0.001794 1.855 0.0652 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2837 on 191 degrees of freedom
## Multiple R-squared: 0.01769, Adjusted R-squared: 0.01255
## F-statistic: 3.44 on 1 and 191 DF, p-value: 0.06517
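Since the slope is only borderline significant (p = 0.0652), a confidence interval makes the uncertainty explicit. A minimal sketch using base R's confint(); the 95% interval for the slope should straddle zero, consistent with that p-value:
# 95% confidence intervals for the model coefficients
confint(my_lm)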
# Plotting gpa and study_hours with the lm line; for the blue residual
# segments to be visible, yend must be the fitted value, not the observed one
fit_hours <- lm(study_hours ~ gpa, data = gpa_study_hours)
gpa_study_hours$fitted_hours <- predict(fit_hours, newdata = gpa_study_hours)
ggplot(gpa_study_hours, aes(gpa, study_hours)) +
  geom_point() +
  stat_smooth(method = lm, se = TRUE) +
  geom_segment(aes(xend = gpa, yend = fitted_hours), color = "blue", size = 0.3)
## `geom_smooth()` using formula 'y ~ x'
hist(resid(my_lm), xlab = "", main = "Histogram of Residuals")
# Arrange the four built-in diagnostic plots in a 2 x 2 grid
par(mfrow = c(2, 2))
plot(my_lm)
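The residual histogram and the Normal Q-Q panel can be backed by a formal test; a minimal sketch using base R (the same Shapiro-Wilk test the diagnostic function in the next section applies):
# Shapiro-Wilk test for normality of the residuals of my_lm
shapiro.test(resid(my_lm))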
For more on checking regression assumptions with diagnostic plots, see:
https://www.analyticsvidhya.com/blog/2016/07/deeper-regression-analysis-assumptions-plots-solutions/
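The assumption checks described at that link can also be run directly on the model fit above; a brief sketch using functions from the lmtest package loaded below:
# Breusch-Pagan test for heteroscedasticity (sketch)
lmtest::bptest(my_lm)
# Durbin-Watson test for autocorrelation in the residuals (sketch)
lmtest::dwtest(my_lm)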
require(car)               # vif(), ncvTest()
require(lmtest)            # bptest(), dwtest()
require(Hmisc)
require(ResourceSelection) # kdepairs()
# sample data
enroll = c(14.55,22.18,61.69,38.02,89.29,67.57,49.16,43.74,26.37,24.74,11.57,19.61)
expend = c(49.53,64.05,138.08,341.35,234.38,164.16,244.08,117.93,110.89,57.74,72.88,89.08)
rwp = c(8.02,9.19,23.32,108.18,45.11,15.35,55.24,17.33,9.59,12.79,7.84,13.26)

# specify model, data, and level
model = expend ~ enroll + rwp
data = cbind(enroll, expend, rwp)
level = .95
# function: model is the regression formula, data is the matrix of
# variables (used for the scatterplot matrix), level is the confidence level
regressit = function(model, data, level)
{
  kdepairs(data)                       # scatterplot matrix (ResourceSelection)
  myreg = lm(model, y = TRUE)          # run regression, keeping y for the error measures below
  confint(myreg, level = level)        # confidence intervals (computed but not returned)
  r1 = summary(myreg)                  # regression summary
  r2 = anova(myreg)                    # ANOVA table for regression
  r3 = coefficients(myreg)             # coefficients
  r4 = myreg$residuals                 # residuals
  r5 = shapiro.test(r4)                # test for normality of residuals
  r6 = bptest(myreg)                   # Breusch-Pagan test for homoscedasticity (lmtest)
  r7a = ncvTest(myreg)                 # non-constant variance test for homoscedasticity (car)
  r7 = dwtest(myreg)                   # Durbin-Watson test for independence of errors (lmtest)
  r8 = vif(myreg)                      # variance inflation factors to check collinearity (car)
  me = mean(myreg$residuals)           # mean error
  mad = mean(abs(myreg$residuals))     # mean absolute deviation
  mse = mean(myreg$residuals^2)        # mean squared error
  rmse = sqrt(mse)                     # root mean squared error
  mpe = 100*mean(myreg$residuals / myreg$y)        # mean percentage error
  mape = 100*mean(abs(myreg$residuals) / myreg$y)  # mean absolute percentage error
  aic = AIC(myreg)
  bic = BIC(myreg)
  all = c(me, mad, rmse, mpe, mape, aic, bic)
  names(all) = c("ME","MAD","RMSE","MPE","MAPE","AIC","BIC")
  mybarplot = barplot(all)             # bar plot of the accuracy measures
  mylist = list(r1, r2, r3, r4, r5, r6, r7, r7a, r8, all)
  return(mylist)
}
regressit(model, data, level)
## [[1]]
##
## Call:
## lm(formula = model, y = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.796 -12.088 -3.396 14.956 29.608
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.5815 11.6548 2.281 0.04850 *
## enroll 1.1505 0.2673 4.304 0.00198 **
## rwp 2.5404 0.2164 11.737 9.3e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.08 on 9 degrees of freedom
## Multiple R-squared: 0.9597, Adjusted R-squared: 0.9507
## F-statistic: 107.1 on 2 and 9 DF, p-value: 5.3e-07
##
##
## [[2]]
## Analysis of Variance Table
##
## Response: expend
## Df Sum Sq Mean Sq F value Pr(>F)
## enroll 1 30854 30854 76.529 1.076e-05 ***
## rwp 1 55537 55537 137.750 9.300e-07 ***
## Residuals 9 3629 403
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## [[3]]
## (Intercept) enroll rwp
## 26.581450 1.150469 2.540402
##
## [[4]]
## 1 2 3 4 5 6 7
## -14.164793 -11.395139 -18.716035 -3.792960 -9.524329 20.846214 20.609704
## 8 9 10 11 12
## -2.998115 29.608237 -29.795786 13.070876 6.252129
##
## [[5]]
##
## Shapiro-Wilk normality test
##
## data: r4
## W = 0.96592, p-value = 0.8637
##
##
## [[6]]
##
## studentized Breusch-Pagan test
##
## data: myreg
## BP = 1.0515, df = 2, p-value = 0.5911
##
##
## [[7]]
##
## Durbin-Watson test
##
## data: myreg
## DW = 2.2799, p-value = 0.6346
## alternative hypothesis: true autocorrelation is greater than 0
##
##
## [[8]]
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.4461112, Df = 1, p = 0.50419
##
## [[9]]
## enroll rwp
## 1.12679 1.12679
##
## [[10]]
## ME MAD RMSE MPE MAPE
## -7.033219e-16 1.506453e+01 1.738909e+01 -3.872326e+00 1.600507e+01
## AIC BIC
## 1.105948e+02 1.125344e+02
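As a sanity check on the accuracy measures in [[10]], the RMSE can be recomputed by hand from the residuals printed in [[4]]; a minimal sketch:
# Recompute RMSE from the printed residuals
res = c(-14.164793, -11.395139, -18.716035, -3.792960, -9.524329, 20.846214,
        20.609704, -2.998115, 29.608237, -29.795786, 13.070876, 6.252129)
sqrt(mean(res^2)) # ~17.389, matching the reported RMSE of 1.738909e+01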