d3.knit

Started with importing all the libraries that might be needed to analyse the dataset.

#Remember to install these packages if you haven't already done so

library(pastecs) #For creating descriptive statistic summaries
library(ggplot2) #For creating histograms with more detail than plot
library(semTools) #For skewness and kurtosis

## Loading required package: lavaan

## This is lavaan 0.6-12
## lavaan is FREE software! Please report any bugs.

##

## ###############################################################################

## This is semTools 0.5-6

## All users of R (or SEM) are invited to submit functions or ideas for functions.

## ###############################################################################

needed_packages <- c("foreign",  "lm.beta", "stargazer", "car", "ppcor", "rstatix")                      
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]    
# Install not installed packages
if(length(not_installed)) install.packages(not_installed) 


library(foreign) #To work with SPSS data
library(lm.beta) #Will allow us to isolate the beta co-efficients
library(stargazer)#For formatting outputs/tables

## 
## Please cite as:

##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

library(rstatix)#Bartlett and posthoc Anova testing

## 
## Attaching package: 'rstatix'

## The following object is masked from 'package:stats':
## 
##     filter

library(ppcor)#partial correlation

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:rstatix':
## 
##     select

library(car)#Levene's test

## Loading required package: carData

options(repos="https://cran.rstudio.com" )
needed_packages <- c("foreign",  "lm.beta", "stargazer", "olsrr", "car")                      
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]    
# Install not installed packages
if(length(not_installed)) install.packages(not_installed) 

library(olsrr)

## 
## Attaching package: 'olsrr'

## The following object is masked from 'package:MASS':
## 
##     cement

## The following object is masked from 'package:datasets':
## 
##     rivers

library(car)
library(foreign) #To work with SPSS data
library(lm.beta) #Will allow us to isolate the beta co-efficients
library(stargazer)#For formatting outputs/tables

bikeshare<- read.csv("BikeSharing.csv",header=TRUE)

Justifcation for inclsusion of variables in the Linear Regression Model

After conducting important preliminaries to justify the inclusion of independent variables (predictors) in building the linear model, it can be seen that there is a relationship between the outcome variable (cnt) and the scalar independent variables (temp,hum) while the dummy variable also exhibits a differential effect on both the outcome variable and the other independent variables. Therefore it has been shown that these variables contribute to the variation in the outcome variable and statistical evidence for their inclusion in the model has been obtained.

Simple Linear Regression -NormExam predicted by Standlrt

Note: This is equivalent to doing a Pearson correlation

model1<-lm(bikeshare$cnt~bikeshare$hum)
anova(model1)

## Analysis of Variance Table
## 
## Response: bikeshare$cnt
##                Df     Sum Sq  Mean Sq F value   Pr(>F)   
## bikeshare$hum   1   27757373 27757373  7.4619 0.006454 **
## Residuals     729 2711778019  3719860                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(model1)

## 
## Call:
## lm(formula = bikeshare$cnt ~ bikeshare$hum)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4741.0 -1386.9    50.3  1439.3  4036.8 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     5364.0      322.7  16.623  < 2e-16 ***
## bikeshare$hum  -1369.1      501.2  -2.732  0.00645 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1929 on 729 degrees of freedom
## Multiple R-squared:  0.01013,    Adjusted R-squared:  0.008774 
## F-statistic: 7.462 on 1 and 729 DF,  p-value: 0.006454

lm.beta::lm.beta(model1)

## 
## Call:
## lm(formula = bikeshare$cnt ~ bikeshare$hum)
## 
## Standardized Coefficients::
##   (Intercept) bikeshare$hum 
##            NA    -0.1006586

stargazer(model1, type="text") #Tidy output of all the required stats

## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                 cnt            
## -----------------------------------------------
## hum                        -1,369.081***       
##                              (501.191)         
##                                                
## Constant                   5,363.986***        
##                              (322.679)         
##                                                
## -----------------------------------------------
## Observations                    731            
## R2                             0.010           
## Adjusted R2                    0.009           
## Residual Std. Error    1,928.694 (df = 729)    
## F Statistic           7.462*** (df = 1; 729)   
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

Multuplie Linear Regression

#Note: R automatically recodes categorical to be dummy variable 0 = reference (boy), 1 category of interest (girl)
model2<-lm(bikeshare$cnt~bikeshare$hum+bikeshare$weathersit)
anova(model2)

## Analysis of Variance Table
## 
## Response: bikeshare$cnt
##                       Df     Sum Sq   Mean Sq F value   Pr(>F)    
## bikeshare$hum          1   27757373  27757373  8.1696 0.004382 ** 
## bikeshare$weathersit   1  238285993 238285993 70.1325 2.84e-16 ***
## Residuals            728 2473492026   3397654                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(model2)

## 
## Call:
## lm(formula = bikeshare$cnt ~ bikeshare$hum + bikeshare$weathersit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4680.8 -1321.8   -29.4  1465.4  4539.6 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            5332.2      308.4  17.289  < 2e-16 ***
## bikeshare$hum          1570.1      593.8   2.644  0.00837 ** 
## bikeshare$weathersit  -1299.9      155.2  -8.375 2.84e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1843 on 728 degrees of freedom
## Multiple R-squared:  0.09711,    Adjusted R-squared:  0.09463 
## F-statistic: 39.15 on 2 and 728 DF,  p-value: < 2.2e-16

stargazer(model2, type="text") #Tidy output of all the required stats

## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                 cnt            
## -----------------------------------------------
## hum                        1,570.127***        
##                              (593.814)         
##                                                
## weathersit                 -1,299.859***       
##                              (155.216)         
##                                                
## Constant                   5,332.233***        
##                              (308.411)         
##                                                
## -----------------------------------------------
## Observations                    731            
## R2                             0.097           
## Adjusted R2                    0.095           
## Residual Std. Error    1,843.273 (df = 728)    
## F Statistic           39.151*** (df = 2; 728)  
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

lm.beta(model2)

## 
## Call:
## lm(formula = bikeshare$cnt ~ bikeshare$hum + bikeshare$weathersit)
## 
## Standardized Coefficients::
##          (Intercept)        bikeshare$hum bikeshare$weathersit 
##                   NA            0.1154400           -0.3656214

stargazer(model2, model2, type="text") #Quick model comparison

## 
## ===========================================================
##                                    Dependent variable:     
##                                ----------------------------
##                                            cnt             
##                                     (1)            (2)     
## -----------------------------------------------------------
## hum                             1,570.127***  1,570.127*** 
##                                  (593.814)      (593.814)  
##                                                            
## weathersit                     -1,299.859***  -1,299.859***
##                                  (155.216)      (155.216)  
##                                                            
## Constant                        5,332.233***  5,332.233*** 
##                                  (308.411)      (308.411)  
##                                                            
## -----------------------------------------------------------
## Observations                        731            731     
## R2                                 0.097          0.097    
## Adjusted R2                        0.095          0.095    
## Residual Std. Error (df = 728)   1,843.273      1,843.273  
## F Statistic (df = 2; 728)        39.151***      39.151***  
## ===========================================================
## Note:                           *p<0.1; **p<0.05; ***p<0.01

Testing Assumptions of the Linear Model

The model will be assessed to ensure it meets the assumptions of No Multicollinearity (Predictors must not be highly correlated), Homoscedasticity and Normally-distributed Errors.

First, the outliers will be determined if any.If the minimum value is equal or below -3.29, or the maximum value is equal or above 3.29 then the observations are tagged as outliers. Influential observations are observation that may change the slope of the line and have a large influence on the fit of the model. ## Assess how model2 meets key assumptions of linear regression

#Influential Outliers - Cook's distance
cooksd<-sort(cooks.distance(model2))
# plot Cook's distance
plot(cooksd, pch="*", cex=2, main="Influential Obs by Cooks distance")  
abline(h = 4*mean(cooksd, na.rm=T), col="red")  # add cutoff line
text(x=1:length(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=T),names(cooksd),""), col="red")  # add labels

#find rows related to influential observations
influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm=T))])  # influential row numbers
stem(influential)

## 
##   The decimal point is 2 digit(s) to the right of the |
## 
##   0 | 1222333445567791
##   2 | 046
##   4 | 566
##   6 | 073

head(bikeshare[influential, ])

##     instant     dteday season yr mnth holiday weekday workingday weathersit
## 23       23 2011-01-23      1  0    1       0       0          0          1
## 106     106 2011-04-16      2  0    4       0       6          0          3
## 87       87 2011-03-28      2  0    3       0       1          1          1
## 18       18 2011-01-18      1  0    1       0       2          1          2
## 9         9 2011-01-09      1  0    1       0       0          0          1
## 38       38 2011-02-07      1  0    2       0       1          1          1
##          temp     atemp      hum windspeed casual registered  cnt
## 23  0.0965217 0.0988391 0.436522 0.2466000    150        836  986
## 106 0.4308330 0.4254920 0.888333 0.3408080    121        674  795
## 87  0.2643480 0.2575740 0.302174 0.2122040    222       1806 2028
## 18  0.2166670 0.2323330 0.861667 0.1467750      9        674  683
## 9   0.1383330 0.1161750 0.434167 0.3619500     54        768  822
## 38  0.2716670 0.3036580 0.738333 0.0454083    120       1592 1712

Some of the outliers are viewed above in full

# influential observations.
head(bikeshare[influential, ]$cnt)  # influential observations - look at the values of tpcoiss

## [1]  986  795 2028  683  822 1712

head(bikeshare[influential, ]$temp)  # influential observations - look at the values of optimism

## [1] 0.0965217 0.4308330 0.2643480 0.2166670 0.1383330 0.2716670

head(bikeshare[influential, ]$weathersit)  # influential observations -look at the values of marlow

## [1] 1 3 1 2 1 1

car::outlierTest(model2) # Bonferonni p-value for most extreme obs - Are there any cases where the outcome variable has an unusual variable for its predictor values?

## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##     rstudent unadjusted p-value Bonferroni p
## 27 -2.553628           0.010864           NA

An analysis of standard residuals was carried out on the data to identify any outliers, which indicated one hundred and sixty five cases for concern.There are also cases were the outcome variable has an unusual variable for its predictor values per the Bonferonni test.

Leverage points are observations that have a value of x that are far away from the mean of x.

car::leveragePlots(model2) # leverage plots

In a well-fitting regression model the variation around the line is constant along the line. The leverage plots show this to an extent although certain points exhibit a lot more variation than others.

Homoscedasticity ensures points have the same scatter and data values that are scattered, or spread out, to about the same extent. It is important to consider since heteroscedasticity increased bias and an increased possibility of Type I error.

#Assess homocedasticity 
plot(model1,2)
plot(model1, 2)

#The Normality of residuals should follow a normal distribution with a mean of 0 in the most ideal case. #A normal distribution of residuals indicates that the same number of points fall above and below the line. #The first plot is the chart of residuals vs fitted values, in the second plot the standardised residuals are on the Y axis. If there is absolutely no heteroscedastity, there would be a completely random, equal distribution of points throughout the range of X axis and a flat red line. The ideal is to see that there is no pattern in the residuals and that they are equally spread around the y = 0 line - the dashed line. #In this case, the red line is slightly distorted in the middle and ends on plot 1 but is not a significant problem. Looking at the second plot it seems that while it is a problem it is not huge. Only a concern if there are definite patterns.

#Create histogram and  density plot of the residuals
plot(density(resid(model2)))

The plot appears to skew left, further investigation is required.

#Create a QQ plotqqPlot(model, main="QQ Plot") #qq plot for studentized resid 
car::qqPlot(model1, main="QQ Plot") #qq plot for studentized resid

## [1]  69 668

There seem to be a lot of points that deviate from the standard distribution, further investigation is required.

#We can make our decision based on the standardized score for skew and kurtosis
#We divide the skew statistic by the standard error to get the standardized score
#This will tell us if we have a problem
popskew<-semTools::skew(resid(model2))
popkurt<-semTools::kurtosis(resid(model2))
popskew[1]/popskew[2]

##    skew (g1) 
## -0.006577716

popkurt[1]/popkurt[2]

## Excess Kur (g2) 
##       -3.828673

#Calculating the percentage of standardized scores for the variable itself that are outside our acceptable range
#this will tell us how big a problem we have
# Calculate the percentage of standardized scores that are greater than 1.96

zpop<- abs(scale(resid(model2)))

FSA::perc(as.numeric(zpop), 1.96, "gt")

## Registered S3 methods overwritten by 'FSA':
##   method       from
##   confint.boot car 
##   hist.boot    car

## [1] 2.599179

FSA::perc(as.numeric(zpop), 3.29, "gt")

## [1] 0

95.86% of standardized scores falling within +/- 1.96 sd. The standard is 95% of standardized scores falling within +/- 1.96. However this falls short but is approximately 95% meaninf the distribution of residuals may be taken to be normal minimizing the possibilty of bias and the possibility of Type I error.

Collinearity occurs when two or more independent variables contain strongly redundant information Conducting MLR with collinear variables will produce invalid results.

#Calculate Collinearity
vifmodel<-car::vif(model2)
vifmodel

##        bikeshare$hum bikeshare$weathersit 
##             1.536886             1.536886

#Calculate tolerance
1/vifmodel

##        bikeshare$hum bikeshare$weathersit 
##            0.6506663            0.6506663