library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(StatDA)
## Loading required package: sgeostat
## Registered S3 method overwritten by 'geoR':
## method from
## plot.variogram sgeostat
library(dvmisc)
## Loading required package: rbenchmark
#all required packages are loaded above (install any missing ones with install.packages() first)
Dataset_Who <- read_excel("C:/Users/zifen/OneDrive/Desktop/Regression analysis/world-happiness-report-2021 removed.xlsx")
colnames(Dataset_Who) <- c("Country","y", "x1", "x2", "x3", "x4", "x5", "x6")
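#variable key (taken from the WHR 2021 column names used in the plot titles below):
# y = Ladder score, x1 = Logged GDP per capita, x2 = Social support,
# x3 = Healthy life expectancy, x4 = Freedom to make life choices,
# x5 = Generosity, x6 = Perceptions of corruption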
View(Dataset_Who)
#scatterplot of y against x1
Dataset_Who_x1 <- ggplot(data = Dataset_Who,
aes(x=x1,y=y)) + geom_point() + geom_smooth(method = "lm", se= FALSE)+ labs(title= "Scatterplot of Ladder Score against Logged GDP per Capita")
print(Dataset_Who_x1)
## `geom_smooth()` using formula 'y ~ x'

Correlation_Coefficient_x1 <- cor(Dataset_Who$x1, Dataset_Who$y)
print(Correlation_Coefficient_x1)
## [1] 0.7897597
#scatterplot of y against x2
Dataset_Who_x2 <- ggplot(data = Dataset_Who,
aes(x=x2,y=y)) + geom_point() + geom_smooth(method = "lm", se= FALSE)+ labs(title= "Scatterplot of Ladder Score against Social Support")
print(Dataset_Who_x2)
## `geom_smooth()` using formula 'y ~ x'

Correlation_Coefficient_x2 <- cor(Dataset_Who$x2, Dataset_Who$y)
print(Correlation_Coefficient_x2)
## [1] 0.7568876
#Scatterplot of y against x3
Dataset_Who_x3 <- ggplot(data = Dataset_Who,
aes(x=x3,y=y)) + geom_point() + geom_smooth(method = "lm", se= FALSE)+ labs(title= "Scatterplot of Ladder Score against Healthy Life Expectancy")
print(Dataset_Who_x3)
## `geom_smooth()` using formula 'y ~ x'

Correlation_Coefficient_x3 <- cor(Dataset_Who$x3, Dataset_Who$y)
print(Correlation_Coefficient_x3)
## [1] 0.7680995
#scatterplot of y against x4
Dataset_Who_x4 <- ggplot(data = Dataset_Who,
aes(x=x4,y=y)) + geom_point() + geom_smooth(method = "lm", se= FALSE)+ labs(title= "Scatterplot of Ladder Score against Freedom to Make Life Choices")
print(Dataset_Who_x4)
## `geom_smooth()` using formula 'y ~ x'

Correlation_Coefficient_x4 <- cor(Dataset_Who$x4, Dataset_Who$y)
print(Correlation_Coefficient_x4)
## [1] 0.6077531
#scatterplot of y against x5
Dataset_Who_x5 <- ggplot(data = Dataset_Who,
aes(x=x5,y=y)) + geom_point() + geom_smooth(method = "lm", se= FALSE)+ labs(title= "Scatterplot of Ladder Score against Generosity")
print(Dataset_Who_x5)
## `geom_smooth()` using formula 'y ~ x'

Correlation_Coefficient_x5 <- cor(Dataset_Who$x5, Dataset_Who$y)
print(Correlation_Coefficient_x5)
## [1] -0.01779928
#scatterplot of y against x6
Dataset_Who_x6 <- ggplot(data = Dataset_Who,
aes(x=x6,y=y)) + geom_point() + geom_smooth(method = "lm", se= FALSE)+ labs(title= "Scatterplot of Ladder Score against Perceptions of Corruption")
print(Dataset_Who_x6)
## `geom_smooth()` using formula 'y ~ x'

Correlation_Coefficient_x6 <- cor(Dataset_Who$x6, Dataset_Who$y)
print(Correlation_Coefficient_x6)
## [1] -0.42114
#overall scatterplot matrix
pairs(Dataset_Who[2:8], main= "Scatterplot Matrix: World Happiness 2021")
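#The pairwise correlations computed one at a time above can also be obtained in a
#single call; a compact alternative, assuming columns 2 to 8 hold y and x1-x6 as
#renamed above:
round(cor(Dataset_Who[2:8]), 3)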

#model fitting: full model with all six regressors
model1 <- glm(y ~ x1+x2+x3+x4+x5+x6 , data = Dataset_Who, family = "gaussian")
summary(model1)
##
## Call:
## glm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6, family = "gaussian",
## data = Dataset_Who)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.85049 -0.30026 0.05735 0.33368 1.04878
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.23722 0.63049 -3.548 0.000526 ***
## x1 0.27953 0.08684 3.219 0.001595 **
## x2 2.47621 0.66822 3.706 0.000301 ***
## x3 0.03031 0.01333 2.274 0.024494 *
## x4 2.01046 0.49480 4.063 7.98e-05 ***
## x5 0.36438 0.32121 1.134 0.258541
## x6 -0.60509 0.29051 -2.083 0.039058 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2934823)
##
## Null deviance: 170.690 on 148 degrees of freedom
## Residual deviance: 41.674 on 142 degrees of freedom
## AIC: 249.01
##
## Number of Fisher Scoring iterations: 2
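#Because a Gaussian glm with identity link is ordinary least squares, a rough
#goodness-of-fit summary (not printed by summary.glm) is the deviance-based R-squared:
1 - deviance(model1)/model1$null.deviance
#from the output above, 1 - 41.674/170.690 is approximately 0.756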
anova(model1)
## Analysis of Deviance Table
##
## Model: gaussian, link: identity
##
## Response: y
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 148 170.690
## x1 1 106.463 147 64.227
## x2 1 8.320 146 55.907
## x3 1 3.476 145 52.430
## x4 1 8.769 144 43.661
## x5 1 0.713 143 42.948
## x6 1 1.273 142 41.674
confint(model1)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -3.472963744 -1.00147484
## x1 0.109329843 0.44973595
## x2 1.166515395 3.78589631
## x3 0.004180908 0.05644672
## x4 1.040665955 2.98026345
## x5 -0.265184293 0.99394818
## x6 -1.174482103 -0.03570143
#Since the p-value for x5 exceeds 0.05, generosity does not contribute significantly to the model.
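#A partial F-test leads to the same conclusion; a quick check with drop1(), which
#refits the full model leaving each term out in turn:
drop1(model1, test = "F")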
#model without x1
model3 <- glm(y ~ x2+x3+x4+x5+x6 , data = Dataset_Who, family = "gaussian")
summary(model3)
##
## Call:
## glm(formula = y ~ x2 + x3 + x4 + x5 + x6, family = "gaussian",
## data = Dataset_Who)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.63771 -0.26591 0.02327 0.37120 1.37329
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.02782 0.64733 -3.133 0.002102 **
## x2 3.49701 0.60713 5.760 4.96e-08 ***
## x3 0.05741 0.01067 5.380 2.98e-07 ***
## x4 1.93916 0.51023 3.801 0.000213 ***
## x5 0.20131 0.32741 0.615 0.539620
## x6 -0.75876 0.29579 -2.565 0.011343 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.3126954)
##
## Null deviance: 170.690 on 148 degrees of freedom
## Residual deviance: 44.715 on 143 degrees of freedom
## AIC: 257.5
##
## Number of Fisher Scoring iterations: 2
#model without x2
model4 <- glm(y ~ x1+x3+x4+x5+x6 , data = Dataset_Who, family = "gaussian")
summary(model4)
##
## Call:
## glm(formula = y ~ x1 + x3 + x4 + x5 + x6, family = "gaussian",
## data = Dataset_Who)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9650 -0.3618 0.1465 0.3708 0.9113
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.65343 0.64744 -4.098 6.94e-05 ***
## x1 0.43225 0.07977 5.419 2.48e-07 ***
## x3 0.03689 0.01379 2.675 0.00835 **
## x4 2.50740 0.49704 5.045 1.36e-06 ***
## x5 0.41226 0.33494 1.231 0.22039
## x6 -0.36662 0.29564 -1.240 0.21697
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.3196124)
##
## Null deviance: 170.690 on 148 degrees of freedom
## Residual deviance: 45.705 on 143 degrees of freedom
## AIC: 260.76
##
## Number of Fisher Scoring iterations: 2
#model without x3
model5 <- glm(y ~ x1+x2+x4+x5+x6 , data = Dataset_Who, family = "gaussian")
summary(model5)
##
## Call:
## glm(formula = y ~ x1 + x2 + x4 + x5 + x6, family = "gaussian",
## data = Dataset_Who)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.07031 -0.32094 0.06193 0.38249 1.05628
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.65809 0.58509 -2.834 0.005264 **
## x1 0.40419 0.06831 5.917 2.32e-08 ***
## x2 2.67826 0.67187 3.986 0.000107 ***
## x4 2.15165 0.49800 4.321 2.90e-05 ***
## x5 0.31849 0.32522 0.979 0.329083
## x6 -0.69012 0.29226 -2.361 0.019559 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.3020383)
##
## Null deviance: 170.690 on 148 degrees of freedom
## Residual deviance: 43.191 on 143 degrees of freedom
## AIC: 252.34
##
## Number of Fisher Scoring iterations: 2
#model without x4
model6 <- glm(y ~ x1+x2+x3+x5+x6 , data = Dataset_Who, family = "gaussian")
summary(model6)
##
## Call:
## glm(formula = y ~ x1 + x2 + x3 + x5 + x6, family = "gaussian",
## data = Dataset_Who)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.62939 -0.32554 0.04462 0.36965 1.24574
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.30347 0.61814 -2.109 0.03671 *
## x1 0.26374 0.09134 2.888 0.00449 **
## x2 3.21205 0.67720 4.743 5.04e-06 ***
## x3 0.03711 0.01393 2.665 0.00859 **
## x5 0.64851 0.33007 1.965 0.05138 .
## x6 -0.92182 0.29464 -3.129 0.00213 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.3253122)
##
## Null deviance: 170.69 on 148 degrees of freedom
## Residual deviance: 46.52 on 143 degrees of freedom
## AIC: 263.4
##
## Number of Fisher Scoring iterations: 2
#model without x5
model2 <- glm(y ~ x1+x2+x3+x4+x6 , data = Dataset_Who, family = "gaussian")
summary(model2)
##
## Call:
## glm(formula = y ~ x1 + x2 + x3 + x4 + x6, family = "gaussian",
## data = Dataset_Who)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.93303 -0.29768 0.06863 0.33924 1.02304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.11039 0.62112 -3.398 0.000880 ***
## x1 0.26400 0.08584 3.075 0.002518 **
## x2 2.50670 0.66835 3.751 0.000256 ***
## x3 0.02936 0.01332 2.204 0.029095 *
## x4 2.13266 0.48342 4.412 2.01e-05 ***
## x6 -0.66778 0.28549 -2.339 0.020718 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.294071)
##
## Null deviance: 170.690 on 148 degrees of freedom
## Residual deviance: 42.052 on 143 degrees of freedom
## AIC: 248.35
##
## Number of Fisher Scoring iterations: 2
#model without x6
model7 <- glm(y ~ x1+x2+x3+x4+x5 , data = Dataset_Who, family = "gaussian")
summary(model7)
##
## Call:
## glm(formula = y ~ x1 + x2 + x3 + x4 + x5, family = "gaussian",
## data = Dataset_Who)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.87399 -0.30110 0.07275 0.36477 1.07979
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.15589 0.45577 -6.924 1.37e-10 ***
## x1 0.30925 0.08665 3.569 0.000489 ***
## x2 2.16789 0.65918 3.289 0.001267 **
## x3 0.03389 0.01338 2.534 0.012367 *
## x4 2.28700 0.48219 4.743 5.05e-06 ***
## x5 0.49164 0.31901 1.541 0.125488
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.3003336)
##
## Null deviance: 170.690 on 148 degrees of freedom
## Residual deviance: 42.948 on 143 degrees of freedom
## AIC: 251.49
##
## Number of Fisher Scoring iterations: 2
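#Collecting the AIC values reported above in one table makes the comparison easier;
#a minimal check, assuming all seven fitted model objects are still in the workspace:
AIC(model1, model2, model3, model4, model5, model6, model7)
#the model without x5 (model2) has the smallest AIC, 248.35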
#final model after dropping x5 (same specification as model2 above)
Nice_Model <- glm(y ~ x1+x2+x3+x4+x6 , data = Dataset_Who, family = "gaussian")
summary(Nice_Model)
##
## Call:
## glm(formula = y ~ x1 + x2 + x3 + x4 + x6, family = "gaussian",
## data = Dataset_Who)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.93303 -0.29768 0.06863 0.33924 1.02304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.11039 0.62112 -3.398 0.000880 ***
## x1 0.26400 0.08584 3.075 0.002518 **
## x2 2.50670 0.66835 3.751 0.000256 ***
## x3 0.02936 0.01332 2.204 0.029095 *
## x4 2.13266 0.48342 4.412 2.01e-05 ***
## x6 -0.66778 0.28549 -2.339 0.020718 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.294071)
##
## Null deviance: 170.690 on 148 degrees of freedom
## Residual deviance: 42.052 on 143 degrees of freedom
## AIC: 248.35
##
## Number of Fisher Scoring iterations: 2
vif(Nice_Model)
## x1 x2 x3 x4 x6
## 4.977918 2.967390 4.083158 1.510650 1.317658
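#A common rule of thumb flags VIF values above 10 (sometimes 5) as evidence of serious
#multicollinearity; none of the values above reach that level. A one-line check:
any(vif(Nice_Model) > 10)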
#model adequacy checking
Dataset_Who$Fitted <- predict(Nice_Model) #fitted values (y-hat), stored as a new column
Dataset_Who$ei <- resid(Nice_Model) #ordinary residuals, ei = yi - yi-hat
mse <- get_mse(Nice_Model) #mean squared error of the fitted model (from dvmisc)
Dataset_Who$di <- Dataset_Who$ei/sqrt(mse) #standardized residuals, di = ei/sqrt(mse)
Dataset_Who$ri <- rstandard(Nice_Model) #internally studentized residuals
Dataset_Who$ti <- rstudent(Nice_Model) #externally studentized (R-student) residuals
#normal probability (QQ) plots of the residuals
qqnorm(Dataset_Who$ei) #qq plot
qqline(Dataset_Who$ei)

qqnorm(Dataset_Who$di)
qqline(Dataset_Who$di)

qqplot.das(Dataset_Who$ei, distribution = "norm", ylab = "empirical quantiles",
xlab = "sample quantiles", main = "Normal QQPLOT for ordinary residuals", las = par("las"),
datax = FALSE, envelope = 0.95, labels = FALSE, col = palette()[2],
lwd = 2, pch = 1, line = c("quartiles", "robust", "none"), cex = 1,
xaxt = "s", add.plot=FALSE,xlim=NULL,ylim=NULL)

qqplot.das(Dataset_Who$di, distribution = "norm", ylab = "empirical quantiles",
xlab = "sample quantiles", main = "Normal QQPLOT for ordinary residuals", las = par("las"),
datax = FALSE, envelope = 0.95, labels = FALSE, col = palette()[2],
lwd = 2, pch = 1, line = c("quartiles", "robust", "none"), cex = 1,
xaxt = "s", add.plot=FALSE,xlim=NULL,ylim=NULL)

#Plot of Residuals versus Fitted
plot(Dataset_Who$Fitted,Dataset_Who$ei)

plot(Dataset_Who$Fitted,Dataset_Who$di)

#Plot of Residuals versus Regressors
plot(Dataset_Who$x1,Dataset_Who$ei)

plot(Dataset_Who$x1,Dataset_Who$di)

plot(Dataset_Who$x2,Dataset_Who$ei)

plot(Dataset_Who$x2,Dataset_Who$di)

plot(Dataset_Who$x3,Dataset_Who$ei)

plot(Dataset_Who$x3,Dataset_Who$di)

plot(Dataset_Who$x4,Dataset_Who$ei)

plot(Dataset_Who$x4,Dataset_Who$di)

plot(Dataset_Who$x6,Dataset_Who$ei)

plot(Dataset_Who$x6,Dataset_Who$di)

plot(Dataset_Who$Fitted,Dataset_Who$ri)

plot(Dataset_Who$Fitted,Dataset_Who$ti)

par(mfrow=c(2,2))
plot(Nice_Model)

#detecting outliers
outlierTest(Nice_Model)
## rstudent unadjusted p-value Bonferroni p
## 16 -3.820122 0.00013339 0.019874
#Observation 16 has a Studentized residual with Bonferroni p = 0.0199 < 0.05,
#so it is flagged as a significant outlier and is the most extreme observation.
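#As an optional visual complement, influencePlot() from the already-loaded car package
#combines Studentized residuals, hat values and Cook's distance in a single display:
influencePlot(Nice_Model)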
Diagonal_Value_hii <- hatvalues(Nice_Model)
CooksDis<- cooks.distance(Nice_Model)
DFFITS <- dffits(Nice_Model)
Table_Of_Data <- data.frame(Diagonal_Value_hii,CooksDis,DFFITS)
View(Table_Of_Data)
k = 5 #5 regressors
n = 149 #149 observations
p = k + 1
Leverage_Standard <- 2*(p/n)
print(Leverage_Standard)
## [1] 0.08053691
Cook_Standard <- 1
print(Cook_Standard)
## [1] 1
DFFITS_Standard <- 2*sqrt(p/n)
print(DFFITS_Standard)
## [1] 0.40134
#Leverage check
Leverage_Check <- Table_Of_Data[Table_Of_Data$Diagonal_Value_hii > Leverage_Standard,]
Leverage_Check[1]
## Diagonal_Value_hii
## 1 0.13030591
## 3 0.08298383
## 13 0.09245673
## 20 0.08198802
## 63 0.08254215
## 92 0.09746705
## 115 0.15210900
## 120 0.09923634
## 127 0.10493130
## 147 0.08717018
Cooks_Check <- Table_Of_Data[Table_Of_Data$CooksDis > Cook_Standard,]
Cooks_Check[2]
## [1] CooksDis
## <0 rows> (or 0-length row.names)
Dffits_Check <- Table_Of_Data[abs(Table_Of_Data$DFFITS) > DFFITS_Standard, ]
Dffits_Check[3]
## DFFITS
## 13 0.6335812
## 16 -0.8438513
## 24 0.4056425
## 53 -0.5042674
## 56 -0.5850135
## 63 0.5968525
## 75 -0.4481072
## 115 -1.2997607
## 120 -0.6652617
## 132 -0.4345952
## 137 -0.4374949
## 147 -0.4458350
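#Observations flagged by both the leverage and the DFFITS criteria can be listed by
#intersecting the row names of the two tables built above (a small convenience check):
intersect(rownames(Leverage_Check), rownames(Dffits_Check))
#next, a transformation is tried: the model is refitted on squared regressors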
Dataset_Who$squarex1 <- (Dataset_Who$x1^2)
Dataset_Who$squarex2 <- (Dataset_Who$x2^2)
Dataset_Who$squarex3 <- (Dataset_Who$x3^2)
Dataset_Who$squarex4 <- (Dataset_Who$x4^2)
Dataset_Who$squarex6 <- (Dataset_Who$x6^2)
Transformer <- glm(Dataset_Who$y ~ squarex1 + squarex2 +squarex3 + squarex4 + squarex6, data = Dataset_Who)
summary(Transformer)
##
## Call:
## glm(formula = Dataset_Who$y ~ squarex1 + squarex2 + squarex3 +
## squarex4 + squarex6, data = Dataset_Who)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.87056 -0.28479 0.02956 0.34383 1.05217
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.447239 0.335533 4.313 2.98e-05 ***
## squarex1 0.013824 0.004634 2.983 0.00335 **
## squarex2 1.766118 0.433414 4.075 7.60e-05 ***
## squarex3 0.000235 0.000105 2.239 0.02668 *
## squarex4 1.409663 0.316560 4.453 1.69e-05 ***
## squarex6 -0.468908 0.234149 -2.003 0.04711 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2829939)
##
## Null deviance: 170.690 on 148 degrees of freedom
## Residual deviance: 40.468 on 143 degrees of freedom
## AIC: 242.63
##
## Number of Fisher Scoring iterations: 2
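#Comparing the transformed model with the model selected earlier; per the summaries
#above, the AIC drops from 248.35 (Nice_Model) to 242.63 (Transformer):
AIC(Nice_Model, Transformer)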
Dataset_Who$transformer_fits <- fitted(Transformer)
Dataset_Who$ei_transformer <- Dataset_Who$y - Dataset_Who$transformer_fits
mse <- get_mse(Transformer)
Dataset_Who$di_transformer <- Dataset_Who$ei_transformer/sqrt(mse)
Histogram_ei <- hist(Dataset_Who$ei_transformer)

print(Histogram_ei)
## $breaks
## [1] -2.0 -1.5 -1.0 -0.5 0.0 0.5 1.0 1.5
##
## $counts
## [1] 1 5 16 46 60 19 2
##
## $density
## [1] 0.01342282 0.06711409 0.21476510 0.61744966 0.80536913 0.25503356 0.02684564
##
## $mids
## [1] -1.75 -1.25 -0.75 -0.25 0.25 0.75 1.25
##
## $xname
## [1] "Dataset_Who$ei_transformer"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
qqplot.das(Dataset_Who$ei_transformer, distribution = "norm", ylab = "empirical quantiles",
xlab = "sample quantiles", main = "Normal QQPLOT for ordinary residuals", las = par("las"),
datax = FALSE, envelope = 0.95, labels = FALSE, col = palette()[2],
lwd = 2, pch = 1, line = c("quartiles", "robust", "none"), cex = 1,
xaxt = "s", add.plot=FALSE,xlim=NULL,ylim=NULL)

qqplot.das(Dataset_Who$di_transformer, distribution = "norm", ylab = "empirical quantiles",
xlab = "sample quantiles", main = "Normal QQPLOT for ordinary residuals", las = par("las"),
datax = FALSE, envelope = 0.95, labels = FALSE, col = palette()[2],
lwd = 2, pch = 1, line = c("quartiles", "robust", "none"), cex = 1,
xaxt = "s", add.plot=FALSE,xlim=NULL,ylim=NULL)
