#Remember to install these packages if you haven't already done so
library(pastecs) #For creating descriptive statistic summaries
library(ggplot2) #For creating histograms with more detail than plot
library(semTools) #For skewness and kurtosis
## Loading required package: lavaan
## This is lavaan 0.6-12
## lavaan is FREE software! Please report any bugs.
##
## ###############################################################################
## This is semTools 0.5-6
## All users of R (or SEM) are invited to submit functions or ideas for functions.
## ###############################################################################
needed_packages <- c("foreign", "lm.beta", "stargazer", "car", "ppcor", "rstatix")
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]
# Install not installed packages
if(length(not_installed)) install.packages(not_installed)
library(foreign) #To work with SPSS data
library(lm.beta) #Will allow us to isolate the beta co-efficients
library(stargazer)#For formatting outputs/tables
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(rstatix)#Bartlett and posthoc Anova testing
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
library(ppcor)#partial correlation
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:rstatix':
##
## select
library(car)#Levene's test
## Loading required package: carData
options(repos="https://cran.rstudio.com" )
needed_packages <- c("foreign", "lm.beta", "stargazer", "olsrr", "car")
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]
# Install not installed packages
if(length(not_installed)) install.packages(not_installed)
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:MASS':
##
## cement
## The following object is masked from 'package:datasets':
##
## rivers
library(car)
library(foreign) #To work with SPSS data
library(lm.beta) #Will allow us to isolate the beta co-efficients
library(stargazer)#For formatting outputs/tables
bikeshare<- read.csv("BikeSharing.csv",header=TRUE)
After conducting important preliminaries to justify the inclusion of independent variables (predictors) in building the linear model, it can be seen that there is a relationship between the outcome variable (cnt) and the scalar independent variables (temp,hum) while the dummy variable also exhibits a differential effect on both the outcome variable and the other independent variables. Therefore it has been shown that these variables contribute to the variation in the outcome variable and statistical evidence for their inclusion in the model has been obtained.
model1<-lm(bikeshare$cnt~bikeshare$hum)
anova(model1)
## Analysis of Variance Table
##
## Response: bikeshare$cnt
## Df Sum Sq Mean Sq F value Pr(>F)
## bikeshare$hum 1 27757373 27757373 7.4619 0.006454 **
## Residuals 729 2711778019 3719860
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model1)
##
## Call:
## lm(formula = bikeshare$cnt ~ bikeshare$hum)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4741.0 -1386.9 50.3 1439.3 4036.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5364.0 322.7 16.623 < 2e-16 ***
## bikeshare$hum -1369.1 501.2 -2.732 0.00645 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1929 on 729 degrees of freedom
## Multiple R-squared: 0.01013, Adjusted R-squared: 0.008774
## F-statistic: 7.462 on 1 and 729 DF, p-value: 0.006454
lm.beta::lm.beta(model1)
##
## Call:
## lm(formula = bikeshare$cnt ~ bikeshare$hum)
##
## Standardized Coefficients::
## (Intercept) bikeshare$hum
## NA -0.1006586
stargazer(model1, type="text") #Tidy output of all the required stats
##
## ===============================================
## Dependent variable:
## ---------------------------
## cnt
## -----------------------------------------------
## hum -1,369.081***
## (501.191)
##
## Constant 5,363.986***
## (322.679)
##
## -----------------------------------------------
## Observations 731
## R2 0.010
## Adjusted R2 0.009
## Residual Std. Error 1,928.694 (df = 729)
## F Statistic 7.462*** (df = 1; 729)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
#Note: R automatically recodes categorical to be dummy variable 0 = reference (boy), 1 category of interest (girl)
model2<-lm(bikeshare$cnt~bikeshare$hum+bikeshare$weathersit)
anova(model2)
## Analysis of Variance Table
##
## Response: bikeshare$cnt
## Df Sum Sq Mean Sq F value Pr(>F)
## bikeshare$hum 1 27757373 27757373 8.1696 0.004382 **
## bikeshare$weathersit 1 238285993 238285993 70.1325 2.84e-16 ***
## Residuals 728 2473492026 3397654
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model2)
##
## Call:
## lm(formula = bikeshare$cnt ~ bikeshare$hum + bikeshare$weathersit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4680.8 -1321.8 -29.4 1465.4 4539.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5332.2 308.4 17.289 < 2e-16 ***
## bikeshare$hum 1570.1 593.8 2.644 0.00837 **
## bikeshare$weathersit -1299.9 155.2 -8.375 2.84e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1843 on 728 degrees of freedom
## Multiple R-squared: 0.09711, Adjusted R-squared: 0.09463
## F-statistic: 39.15 on 2 and 728 DF, p-value: < 2.2e-16
stargazer(model2, type="text") #Tidy output of all the required stats
##
## ===============================================
## Dependent variable:
## ---------------------------
## cnt
## -----------------------------------------------
## hum 1,570.127***
## (593.814)
##
## weathersit -1,299.859***
## (155.216)
##
## Constant 5,332.233***
## (308.411)
##
## -----------------------------------------------
## Observations 731
## R2 0.097
## Adjusted R2 0.095
## Residual Std. Error 1,843.273 (df = 728)
## F Statistic 39.151*** (df = 2; 728)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
lm.beta(model2)
##
## Call:
## lm(formula = bikeshare$cnt ~ bikeshare$hum + bikeshare$weathersit)
##
## Standardized Coefficients::
## (Intercept) bikeshare$hum bikeshare$weathersit
## NA 0.1154400 -0.3656214
stargazer(model2, model2, type="text") #Quick model comparison
##
## ===========================================================
## Dependent variable:
## ----------------------------
## cnt
## (1) (2)
## -----------------------------------------------------------
## hum 1,570.127*** 1,570.127***
## (593.814) (593.814)
##
## weathersit -1,299.859*** -1,299.859***
## (155.216) (155.216)
##
## Constant 5,332.233*** 5,332.233***
## (308.411) (308.411)
##
## -----------------------------------------------------------
## Observations 731 731
## R2 0.097 0.097
## Adjusted R2 0.095 0.095
## Residual Std. Error (df = 728) 1,843.273 1,843.273
## F Statistic (df = 2; 728) 39.151*** 39.151***
## ===========================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
The model will be assessed to ensure it meets the assumptions of No Multicollinearity (Predictors must not be highly correlated), Homoscedasticity and Normally-distributed Errors.
First, the outliers will be determined if any.If the minimum value is equal or below -3.29, or the maximum value is equal or above 3.29 then the observations are tagged as outliers. Influential observations are observation that may change the slope of the line and have a large influence on the fit of the model. ## Assess how model2 meets key assumptions of linear regression
#Influential Outliers - Cook's distance
cooksd<-sort(cooks.distance(model2))
# plot Cook's distance
plot(cooksd, pch="*", cex=2, main="Influential Obs by Cooks distance")
abline(h = 4*mean(cooksd, na.rm=T), col="red") # add cutoff line
text(x=1:length(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=T),names(cooksd),""), col="red") # add labels
#find rows related to influential observations
influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm=T))]) # influential row numbers
stem(influential)
##
## The decimal point is 2 digit(s) to the right of the |
##
## 0 | 1222333445567791
## 2 | 046
## 4 | 566
## 6 | 073
head(bikeshare[influential, ])
## instant dteday season yr mnth holiday weekday workingday weathersit
## 23 23 2011-01-23 1 0 1 0 0 0 1
## 106 106 2011-04-16 2 0 4 0 6 0 3
## 87 87 2011-03-28 2 0 3 0 1 1 1
## 18 18 2011-01-18 1 0 1 0 2 1 2
## 9 9 2011-01-09 1 0 1 0 0 0 1
## 38 38 2011-02-07 1 0 2 0 1 1 1
## temp atemp hum windspeed casual registered cnt
## 23 0.0965217 0.0988391 0.436522 0.2466000 150 836 986
## 106 0.4308330 0.4254920 0.888333 0.3408080 121 674 795
## 87 0.2643480 0.2575740 0.302174 0.2122040 222 1806 2028
## 18 0.2166670 0.2323330 0.861667 0.1467750 9 674 683
## 9 0.1383330 0.1161750 0.434167 0.3619500 54 768 822
## 38 0.2716670 0.3036580 0.738333 0.0454083 120 1592 1712
Some of the outliers are viewed above in full
# influential observations.
head(bikeshare[influential, ]$cnt) # influential observations - look at the values of tpcoiss
## [1] 986 795 2028 683 822 1712
head(bikeshare[influential, ]$temp) # influential observations - look at the values of optimism
## [1] 0.0965217 0.4308330 0.2643480 0.2166670 0.1383330 0.2716670
head(bikeshare[influential, ]$weathersit) # influential observations -look at the values of marlow
## [1] 1 3 1 2 1 1
car::outlierTest(model2) # Bonferonni p-value for most extreme obs - Are there any cases where the outcome variable has an unusual variable for its predictor values?
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 27 -2.553628 0.010864 NA
An analysis of standard residuals was carried out on the data to identify any outliers, which indicated one hundred and sixty five cases for concern.There are also cases were the outcome variable has an unusual variable for its predictor values per the Bonferonni test.
Leverage points are observations that have a value of x that are far away from the mean of x.
car::leveragePlots(model2) # leverage plots
In a well-fitting regression model the variation around the line is
constant along the line. The leverage plots show this to an extent
although certain points exhibit a lot more variation than others.
Homoscedasticity ensures points have the same scatter and data values that are scattered, or spread out, to about the same extent. It is important to consider since heteroscedasticity increased bias and an increased possibility of Type I error.
#Assess homocedasticity
plot(model1,2)
plot(model1, 2)
#The Normality of residuals should follow a normal distribution with a
mean of 0 in the most ideal case. #A normal distribution of residuals
indicates that the same number of points fall above and below the line.
#The first plot is the chart of residuals vs fitted values, in the
second plot the standardised residuals are on the Y axis. If there is
absolutely no heteroscedastity, there would be a completely random,
equal distribution of points throughout the range of X axis and a flat
red line. The ideal is to see that there is no pattern in the residuals
and that they are equally spread around the y = 0 line - the dashed
line. #In this case, the red line is slightly distorted in the middle
and ends on plot 1 but is not a significant problem. Looking at the
second plot it seems that while it is a problem it is not huge. Only a
concern if there are definite patterns.
#Create histogram and density plot of the residuals
plot(density(resid(model2)))
The plot appears to skew left, further investigation is required.
#Create a QQ plotqqPlot(model, main="QQ Plot") #qq plot for studentized resid
car::qqPlot(model1, main="QQ Plot") #qq plot for studentized resid
## [1] 69 668
There seem to be a lot of points that deviate from the standard distribution, further investigation is required.
#We can make our decision based on the standardized score for skew and kurtosis
#We divide the skew statistic by the standard error to get the standardized score
#This will tell us if we have a problem
popskew<-semTools::skew(resid(model2))
popkurt<-semTools::kurtosis(resid(model2))
popskew[1]/popskew[2]
## skew (g1)
## -0.006577716
popkurt[1]/popkurt[2]
## Excess Kur (g2)
## -3.828673
#Calculating the percentage of standardized scores for the variable itself that are outside our acceptable range
#this will tell us how big a problem we have
# Calculate the percentage of standardized scores that are greater than 1.96
zpop<- abs(scale(resid(model2)))
FSA::perc(as.numeric(zpop), 1.96, "gt")
## Registered S3 methods overwritten by 'FSA':
## method from
## confint.boot car
## hist.boot car
## [1] 2.599179
FSA::perc(as.numeric(zpop), 3.29, "gt")
## [1] 0
95.86% of standardized scores falling within +/- 1.96 sd. The standard is 95% of standardized scores falling within +/- 1.96. However this falls short but is approximately 95% meaninf the distribution of residuals may be taken to be normal minimizing the possibilty of bias and the possibility of Type I error.
Collinearity occurs when two or more independent variables contain strongly redundant information Conducting MLR with collinear variables will produce invalid results.
#Calculate Collinearity
vifmodel<-car::vif(model2)
vifmodel
## bikeshare$hum bikeshare$weathersit
## 1.536886 1.536886
#Calculate tolerance
1/vifmodel
## bikeshare$hum bikeshare$weathersit
## 0.6506663 0.6506663