library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(StatDA)
## Loading required package: sgeostat
## Registered S3 method overwritten by 'geoR':
##   method         from    
##   plot.variogram sgeostat
library(dvmisc)
## Loading required package: rbenchmark
# install any of the above packages that are missing with install.packages(), then load them
Dataset_Who <- read_excel("C:/Users/zifen/OneDrive/Desktop/Regression analysis/world-happiness-report-2021 removed.xlsx")
colnames(Dataset_Who) <- c("Country","y", "x1", "x2", "x3", "x4", "x5", "x6")
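# Column mapping used throughout (taken from the scatterplot titles below):
#   y  = Ladder score                 x1 = Logged GDP per capita
#   x2 = Social support               x3 = Healthy life expectancy
#   x4 = Freedom to make life choices
#   x5 = Generosity                   x6 = Perceptions of corruption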
View(Dataset_Who)


#scatterplot of y against x1
Dataset_Who_x1 <- ggplot(data = Dataset_Who, 
                         aes(x=x1,y=y)) + geom_point() + geom_smooth(method = "lm", se= FALSE)+ labs(title= "Scatterplot of Ladder Score against Logged GDP per Capita")
print(Dataset_Who_x1)
## `geom_smooth()` using formula 'y ~ x'

Correlation_Coefficient_x1 <- cor(Dataset_Who$x1, Dataset_Who$y)
print(Correlation_Coefficient_x1)
## [1] 0.7897597
#scatterplot of y against x2
Dataset_Who_x2 <- ggplot(data = Dataset_Who, 
                         aes(x=x2,y=y)) + geom_point() + geom_smooth(method = "lm", se= FALSE)+ labs(title= "Scatterplot of Ladder Score against Social Support")
print(Dataset_Who_x2)
## `geom_smooth()` using formula 'y ~ x'

Correlation_Coefficient_x2 <- cor(Dataset_Who$x2, Dataset_Who$y)
print(Correlation_Coefficient_x2)
## [1] 0.7568876
#Scatterplot of y against x3
Dataset_Who_x3 <- ggplot(data = Dataset_Who, 
                         aes(x=x3,y=y)) + geom_point() + geom_smooth(method = "lm", se= FALSE)+ labs(title= "Scatterplot of Ladder Score against Healthy Life Expectancy")
print(Dataset_Who_x3)
## `geom_smooth()` using formula 'y ~ x'

Correlation_Coefficient_x3 <- cor(Dataset_Who$x3, Dataset_Who$y)
print(Correlation_Coefficient_x3)
## [1] 0.7680995
#scatterplot of y against x4
Dataset_Who_x4 <- ggplot(data = Dataset_Who, 
                         aes(x=x4,y=y)) + geom_point() + geom_smooth(method = "lm", se= FALSE)+ labs(title= "Scatterplot of Ladder Score against Freedom to Make Life Choices")
print(Dataset_Who_x4)
## `geom_smooth()` using formula 'y ~ x'

Correlation_Coefficient_x4 <- cor(Dataset_Who$x4, Dataset_Who$y)
print(Correlation_Coefficient_x4)
## [1] 0.6077531
#scatterplot of y against x5
Dataset_Who_x5 <- ggplot(data = Dataset_Who, 
                         aes(x=x5,y=y)) + geom_point() + geom_smooth(method = "lm", se= FALSE)+ labs(title= "Scatterplot of Ladder Score against Generosity")
print(Dataset_Who_x5)
## `geom_smooth()` using formula 'y ~ x'

Correlation_Coefficient_x5 <- cor(Dataset_Who$x5, Dataset_Who$y)
print(Correlation_Coefficient_x5)
## [1] -0.01779928
#scatterplot of y against x6
Dataset_Who_x6 <- ggplot(data = Dataset_Who, 
                         aes(x=x6,y=y)) + geom_point() + geom_smooth(method = "lm", se= FALSE)+ labs(title= "Scatterplot of Ladder Score against Perceptions of Corruption")
print(Dataset_Who_x6)
## `geom_smooth()` using formula 'y ~ x'

Correlation_Coefficient_x6 <- cor(Dataset_Who$x6, Dataset_Who$y)
print(Correlation_Coefficient_x6)
## [1] -0.42114
#overall scatterplot matrix
pairs(Dataset_Who[2:8], main= "Scatterplot Matrix of World Happiness 2021") 
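# Optional: the six per-regressor scatterplots and correlations above can be
# produced in a single loop instead of copy-pasting; a minimal sketch assuming
# the column names and titles used in this report (output omitted):
regressor_labels <- c(x1 = "Logged GDP per Capita", x2 = "Social Support",
                      x3 = "Healthy Life Expectancy",
                      x4 = "Freedom to Make Life Choices",
                      x5 = "Generosity", x6 = "Perceptions of Corruption")
for (v in names(regressor_labels)) {
  p <- ggplot(Dataset_Who, aes(x = .data[[v]], y = y)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE) +
    labs(title = paste("Scatterplot of Ladder Score against", regressor_labels[[v]]))
  print(p)
  print(cor(Dataset_Who[[v]], Dataset_Who$y))
}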

#data analysis

model1 <- glm(y ~ x1+x2+x3+x4+x5+x6 , data = Dataset_Who, family = "gaussian")
summary(model1)
## 
## Call:
## glm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6, family = "gaussian", 
##     data = Dataset_Who)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.85049  -0.30026   0.05735   0.33368   1.04878  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.23722    0.63049  -3.548 0.000526 ***
## x1           0.27953    0.08684   3.219 0.001595 ** 
## x2           2.47621    0.66822   3.706 0.000301 ***
## x3           0.03031    0.01333   2.274 0.024494 *  
## x4           2.01046    0.49480   4.063 7.98e-05 ***
## x5           0.36438    0.32121   1.134 0.258541    
## x6          -0.60509    0.29051  -2.083 0.039058 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2934823)
## 
##     Null deviance: 170.690  on 148  degrees of freedom
## Residual deviance:  41.674  on 142  degrees of freedom
## AIC: 249.01
## 
## Number of Fisher Scoring iterations: 2
anova(model1)
## Analysis of Deviance Table
## 
## Model: gaussian, link: identity
## 
## Response: y
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev
## NULL                   148    170.690
## x1    1  106.463       147     64.227
## x2    1    8.320       146     55.907
## x3    1    3.476       145     52.430
## x4    1    8.769       144     43.661
## x5    1    0.713       143     42.948
## x6    1    1.273       142     41.674
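# For a gaussian GLM the "Deviance" column above is simply the sequential
# reduction in the residual sum of squares; sequential F tests can be
# requested directly if wanted (optional, output omitted):
anova(model1, test = "F")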
confint(model1)
## Waiting for profiling to be done...
##                    2.5 %      97.5 %
## (Intercept) -3.472963744 -1.00147484
## x1           0.109329843  0.44973595
## x2           1.166515395  3.78589631
## x3           0.004180908  0.05644672
## x4           1.040665955  2.98026345
## x5          -0.265184293  0.99394818
## x6          -1.174482103 -0.03570143
# Since the p-value for x5 exceeds 0.05, generosity does not contribute significantly to the model.
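# Side note: a gaussian GLM with the identity link is ordinary least squares,
# so the same model can be refit with lm() to obtain R-squared as well;
# a minimal equivalent sketch (output omitted):
model1_ls <- lm(y ~ x1 + x2 + x3 + x4 + x5 + x6, data = Dataset_Who)
summary(model1_ls)$r.squared  # coefficient of determination of the full model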

#model without x1
model3 <- glm(y ~ x2+x3+x4+x5+x6 , data = Dataset_Who, family = "gaussian")
summary(model3)
## 
## Call:
## glm(formula = y ~ x2 + x3 + x4 + x5 + x6, family = "gaussian", 
##     data = Dataset_Who)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.63771  -0.26591   0.02327   0.37120   1.37329  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.02782    0.64733  -3.133 0.002102 ** 
## x2           3.49701    0.60713   5.760 4.96e-08 ***
## x3           0.05741    0.01067   5.380 2.98e-07 ***
## x4           1.93916    0.51023   3.801 0.000213 ***
## x5           0.20131    0.32741   0.615 0.539620    
## x6          -0.75876    0.29579  -2.565 0.011343 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.3126954)
## 
##     Null deviance: 170.690  on 148  degrees of freedom
## Residual deviance:  44.715  on 143  degrees of freedom
## AIC: 257.5
## 
## Number of Fisher Scoring iterations: 2
#model without x2
model4 <- glm(y ~ x1+x3+x4+x5+x6 , data = Dataset_Who, family = "gaussian")
summary(model4)
## 
## Call:
## glm(formula = y ~ x1 + x3 + x4 + x5 + x6, family = "gaussian", 
##     data = Dataset_Who)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9650  -0.3618   0.1465   0.3708   0.9113  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.65343    0.64744  -4.098 6.94e-05 ***
## x1           0.43225    0.07977   5.419 2.48e-07 ***
## x3           0.03689    0.01379   2.675  0.00835 ** 
## x4           2.50740    0.49704   5.045 1.36e-06 ***
## x5           0.41226    0.33494   1.231  0.22039    
## x6          -0.36662    0.29564  -1.240  0.21697    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.3196124)
## 
##     Null deviance: 170.690  on 148  degrees of freedom
## Residual deviance:  45.705  on 143  degrees of freedom
## AIC: 260.76
## 
## Number of Fisher Scoring iterations: 2
#model without x3
model5 <- glm(y ~ x1+x2+x4+x5+x6 , data = Dataset_Who, family = "gaussian")
summary(model5)
## 
## Call:
## glm(formula = y ~ x1 + x2 + x4 + x5 + x6, family = "gaussian", 
##     data = Dataset_Who)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.07031  -0.32094   0.06193   0.38249   1.05628  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.65809    0.58509  -2.834 0.005264 ** 
## x1           0.40419    0.06831   5.917 2.32e-08 ***
## x2           2.67826    0.67187   3.986 0.000107 ***
## x4           2.15165    0.49800   4.321 2.90e-05 ***
## x5           0.31849    0.32522   0.979 0.329083    
## x6          -0.69012    0.29226  -2.361 0.019559 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.3020383)
## 
##     Null deviance: 170.690  on 148  degrees of freedom
## Residual deviance:  43.191  on 143  degrees of freedom
## AIC: 252.34
## 
## Number of Fisher Scoring iterations: 2
#model without x4
model6 <- glm(y ~ x1+x2+x3+x5+x6 , data = Dataset_Who, family = "gaussian")
summary(model6)
## 
## Call:
## glm(formula = y ~ x1 + x2 + x3 + x5 + x6, family = "gaussian", 
##     data = Dataset_Who)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.62939  -0.32554   0.04462   0.36965   1.24574  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.30347    0.61814  -2.109  0.03671 *  
## x1           0.26374    0.09134   2.888  0.00449 ** 
## x2           3.21205    0.67720   4.743 5.04e-06 ***
## x3           0.03711    0.01393   2.665  0.00859 ** 
## x5           0.64851    0.33007   1.965  0.05138 .  
## x6          -0.92182    0.29464  -3.129  0.00213 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.3253122)
## 
##     Null deviance: 170.69  on 148  degrees of freedom
## Residual deviance:  46.52  on 143  degrees of freedom
## AIC: 263.4
## 
## Number of Fisher Scoring iterations: 2
#model without x5
model2 <- glm(y ~ x1+x2+x3+x4+x6 , data = Dataset_Who, family = "gaussian")
summary(model2)
## 
## Call:
## glm(formula = y ~ x1 + x2 + x3 + x4 + x6, family = "gaussian", 
##     data = Dataset_Who)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.93303  -0.29768   0.06863   0.33924   1.02304  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.11039    0.62112  -3.398 0.000880 ***
## x1           0.26400    0.08584   3.075 0.002518 ** 
## x2           2.50670    0.66835   3.751 0.000256 ***
## x3           0.02936    0.01332   2.204 0.029095 *  
## x4           2.13266    0.48342   4.412 2.01e-05 ***
## x6          -0.66778    0.28549  -2.339 0.020718 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.294071)
## 
##     Null deviance: 170.690  on 148  degrees of freedom
## Residual deviance:  42.052  on 143  degrees of freedom
## AIC: 248.35
## 
## Number of Fisher Scoring iterations: 2
#model without x6
model7 <- glm(y ~ x1+x2+x3+x4+x5 , data = Dataset_Who, family = "gaussian")
summary(model7)
## 
## Call:
## glm(formula = y ~ x1 + x2 + x3 + x4 + x5, family = "gaussian", 
##     data = Dataset_Who)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.87399  -0.30110   0.07275   0.36477   1.07979  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.15589    0.45577  -6.924 1.37e-10 ***
## x1           0.30925    0.08665   3.569 0.000489 ***
## x2           2.16789    0.65918   3.289 0.001267 ** 
## x3           0.03389    0.01338   2.534 0.012367 *  
## x4           2.28700    0.48219   4.743 5.05e-06 ***
## x5           0.49164    0.31901   1.541 0.125488    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.3003336)
## 
##     Null deviance: 170.690  on 148  degrees of freedom
## Residual deviance:  42.948  on 143  degrees of freedom
## AIC: 251.49
## 
## Number of Fisher Scoring iterations: 2
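# Optional: rather than reading the AIC off each summary above, the candidate
# models can be compared in one call (output omitted):
AIC(model1, model2, model3, model4, model5, model6, model7)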
#final model after dropping x5 (same specification as model2 above)
Nice_Model <- glm(y ~ x1+x2+x3+x4+x6 , data = Dataset_Who, family = "gaussian")
summary(Nice_Model)
## 
## Call:
## glm(formula = y ~ x1 + x2 + x3 + x4 + x6, family = "gaussian", 
##     data = Dataset_Who)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.93303  -0.29768   0.06863   0.33924   1.02304  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.11039    0.62112  -3.398 0.000880 ***
## x1           0.26400    0.08584   3.075 0.002518 ** 
## x2           2.50670    0.66835   3.751 0.000256 ***
## x3           0.02936    0.01332   2.204 0.029095 *  
## x4           2.13266    0.48342   4.412 2.01e-05 ***
## x6          -0.66778    0.28549  -2.339 0.020718 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.294071)
## 
##     Null deviance: 170.690  on 148  degrees of freedom
## Residual deviance:  42.052  on 143  degrees of freedom
## AIC: 248.35
## 
## Number of Fisher Scoring iterations: 2
vif(Nice_Model)
##       x1       x2       x3       x4       x6 
## 4.977918 2.967390 4.083158 1.510650 1.317658
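# Each VIF above equals 1/(1 - R^2_j), where R^2_j comes from regressing that
# predictor on the remaining ones; a quick check for x1, which should
# reproduce the first value above (about 4.98), output omitted:
r2_x1 <- summary(lm(x1 ~ x2 + x3 + x4 + x6, data = Dataset_Who))$r.squared
1 / (1 - r2_x1)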
#model adequacy checking

Dataset_Who$Fitted <- predict(Nice_Model) #fitted values (y-hat), stored in a new column "Fitted"

Dataset_Who$ei <- resid(Nice_Model)    #Residuals, yi-yihat

mse <- get_mse(Nice_Model) #residual mean square (MSE) of the fitted model
Dataset_Who$di <- Dataset_Who$ei/sqrt(mse)   #Standardized Residuals, di = ei/sqrt(mse)
Dataset_Who$ri <- rstandard(Nice_Model) #internally studentized residuals
Dataset_Who$ti <- rstudent(Nice_Model)  #externally studentized (R-student) residuals
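# Assumption: get_mse() is taken here to return the residual mean square
# SSE/(n - p); as a sanity check this should match both a manual computation
# and the dispersion parameter printed by summary(Nice_Model) (about 0.294):
mse_manual <- sum(resid(Nice_Model)^2) / df.residual(Nice_Model)
all.equal(mse, mse_manual)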

#plotting normal probability plot
qqnorm(Dataset_Who$ei) #qq plot
qqline(Dataset_Who$ei)

qqnorm(Dataset_Who$di)
qqline(Dataset_Who$di)

qqplot.das(Dataset_Who$ei, distribution = "norm", ylab = "sample quantiles", 
           xlab = "theoretical quantiles", main = "Normal QQ plot for ordinary residuals", las = par("las"), 
           datax = FALSE, envelope = 0.95, labels = FALSE, col = palette()[2], 
           lwd = 2, pch = 1, line = c("quartiles", "robust", "none"), cex = 1, 
           xaxt = "s", add.plot = FALSE, xlim = NULL, ylim = NULL)

qqplot.das(Dataset_Who$di, distribution = "norm", ylab = "sample quantiles", 
           xlab = "theoretical quantiles", main = "Normal QQ plot for standardized residuals", las = par("las"), 
           datax = FALSE, envelope = 0.95, labels = FALSE, col = palette()[2], 
           lwd = 2, pch = 1, line = c("quartiles", "robust", "none"), cex = 1, 
           xaxt = "s", add.plot = FALSE, xlim = NULL, ylim = NULL)

#Plot of Residuals versus Fitted
plot(Dataset_Who$Fitted,Dataset_Who$ei)

plot(Dataset_Who$Fitted,Dataset_Who$di)

#Plot of Residuals versus Regressors
plot(Dataset_Who$x1,Dataset_Who$ei)

plot(Dataset_Who$x1,Dataset_Who$di)

plot(Dataset_Who$x2,Dataset_Who$ei)

plot(Dataset_Who$x2,Dataset_Who$di)

plot(Dataset_Who$x3,Dataset_Who$ei)

plot(Dataset_Who$x3,Dataset_Who$di)

plot(Dataset_Who$x4,Dataset_Who$ei)

plot(Dataset_Who$x4,Dataset_Who$di)

plot(Dataset_Who$x6,Dataset_Who$ei)

plot(Dataset_Who$x6,Dataset_Who$di)

plot(Dataset_Who$Fitted,Dataset_Who$ri)

plot(Dataset_Who$Fitted,Dataset_Who$ti)

par(mfrow=c(2,2))
plot(Nice_Model)
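# Restore the default single-panel layout so the base-graphics plots drawn
# later (the histogram and the qqplot.das() calls) use the full device:
par(mfrow = c(1, 1))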

#detecting outliers

outlierTest(Nice_Model)
##     rstudent unadjusted p-value Bonferroni p
## 16 -3.820122         0.00013339     0.019874
#Observation 16 has a studentized residual with Bonferroni p = 0.0199 (< 0.05)
#and is therefore flagged as the most extreme outlier
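# The flagged observation can be inspected directly; assuming no rows were
# dropped during fitting, label 16 is simply row 16 of the data (output omitted):
Dataset_Who[16, c("Country", "y", "x1", "x2", "x3", "x4", "x6")]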

Diagonal_Value_hii <- hatvalues(Nice_Model)
CooksDis<- cooks.distance(Nice_Model)
DFFITS <- dffits(Nice_Model)
Table_Of_Data <- data.frame(Diagonal_Value_hii,CooksDis,DFFITS)
View(Table_Of_Data)

k <- 5   #5 regressors
n <- 149 #149 observations
p <- k + 1

Leverage_Standard <- 2*(p/n)
print(Leverage_Standard)
## [1] 0.08053691
Cook_Standard <- 1
print(Cook_Standard)
## [1] 1
DFFITS_Standard <- 2*sqrt(p/n)
print(DFFITS_Standard)
## [1] 0.40134
#Leverage check 
Leverage_Check <- Table_Of_Data[Table_Of_Data$Diagonal_Value_hii > Leverage_Standard,]
Leverage_Check[1]
##     Diagonal_Value_hii
## 1           0.13030591
## 3           0.08298383
## 13          0.09245673
## 20          0.08198802
## 63          0.08254215
## 92          0.09746705
## 115         0.15210900
## 120         0.09923634
## 127         0.10493130
## 147         0.08717018
Cooks_Check <- Table_Of_Data[Table_Of_Data$CooksDis > Cook_Standard,]
Cooks_Check[2]
## [1] CooksDis
## <0 rows> (or 0-length row.names)
Dffits_Check <- Table_Of_Data[abs(Table_Of_Data$DFFITS) > DFFITS_Standard, ]
Dffits_Check[3]
##         DFFITS
## 13   0.6335812
## 16  -0.8438513
## 24   0.4056425
## 53  -0.5042674
## 56  -0.5850135
## 63   0.5968525
## 75  -0.4481072
## 115 -1.2997607
## 120 -0.6652617
## 132 -0.4345952
## 137 -0.4374949
## 147 -0.4458350
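# Optional cross-check: base R's influence.measures() flags leverage, Cook's
# distance, DFFITS and DFBETAS in one call, though its default cutoffs differ
# somewhat from the rules of thumb used above (output omitted):
summary(influence.measures(Nice_Model))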
Dataset_Who$squarex1 <- (Dataset_Who$x1^2)
Dataset_Who$squarex2 <- (Dataset_Who$x2^2)
Dataset_Who$squarex3 <- (Dataset_Who$x3^2)
Dataset_Who$squarex4 <- (Dataset_Who$x4^2)
Dataset_Who$squarex6 <- (Dataset_Who$x6^2)
Transformer <- glm(Dataset_Who$y ~ squarex1 + squarex2 +squarex3 + squarex4 + squarex6, data = Dataset_Who)
summary(Transformer)
## 
## Call:
## glm(formula = Dataset_Who$y ~ squarex1 + squarex2 + squarex3 + 
##     squarex4 + squarex6, data = Dataset_Who)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.87056  -0.28479   0.02956   0.34383   1.05217  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.447239   0.335533   4.313 2.98e-05 ***
## squarex1     0.013824   0.004634   2.983  0.00335 ** 
## squarex2     1.766118   0.433414   4.075 7.60e-05 ***
## squarex3     0.000235   0.000105   2.239  0.02668 *  
## squarex4     1.409663   0.316560   4.453 1.69e-05 ***
## squarex6    -0.468908   0.234149  -2.003  0.04711 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2829939)
## 
##     Null deviance: 170.690  on 148  degrees of freedom
## Residual deviance:  40.468  on 143  degrees of freedom
## AIC: 242.63
## 
## Number of Fisher Scoring iterations: 2
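# The same quadratic model can be specified without creating the squared
# columns by hand, using I() inside the formula; this sketch should reproduce
# the fit above (output omitted):
Transformer_inline <- glm(y ~ I(x1^2) + I(x2^2) + I(x3^2) + I(x4^2) + I(x6^2),
                          data = Dataset_Who, family = "gaussian")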
Dataset_Who$transformer_fits <- fitted(Transformer)
Dataset_Who$ei_transformer <- Dataset_Who$y - Dataset_Who$transformer_fits
mse <- get_mse(Transformer)
Dataset_Who$di_transformer <- Dataset_Who$ei_transformer/sqrt(mse)
Histogram_ei <- hist(Dataset_Who$ei_transformer)

print(Histogram_ei)
## $breaks
## [1] -2.0 -1.5 -1.0 -0.5  0.0  0.5  1.0  1.5
## 
## $counts
## [1]  1  5 16 46 60 19  2
## 
## $density
## [1] 0.01342282 0.06711409 0.21476510 0.61744966 0.80536913 0.25503356 0.02684564
## 
## $mids
## [1] -1.75 -1.25 -0.75 -0.25  0.25  0.75  1.25
## 
## $xname
## [1] "Dataset_Who$ei_transformer"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
qqplot.das(Dataset_Who$ei_transformer, distribution = "norm", ylab = "sample quantiles", 
           xlab = "theoretical quantiles", main = "Normal QQ plot for ordinary residuals (transformed model)", las = par("las"), 
           datax = FALSE, envelope = 0.95, labels = FALSE, col = palette()[2], 
           lwd = 2, pch = 1, line = c("quartiles", "robust", "none"), cex = 1, 
           xaxt = "s", add.plot = FALSE, xlim = NULL, ylim = NULL)

qqplot.das(Dataset_Who$di_transformer, distribution = "norm", ylab = "sample quantiles", 
           xlab = "theoretical quantiles", main = "Normal QQ plot for standardized residuals (transformed model)", las = par("las"), 
           datax = FALSE, envelope = 0.95, labels = FALSE, col = palette()[2], 
           lwd = 2, pch = 1, line = c("quartiles", "robust", "none"), cex = 1, 
           xaxt = "s", add.plot = FALSE, xlim = NULL, ylim = NULL)