1.0 Introduction

library(readxl) # To read files from Microsoft Excel
library(fitdistrplus) # To fit distributions and plot fitted histograms
library(psych) # To plot scatterplot and correlation matrix
library(tidyverse) # For data manipulation and visualization
library(stats4) # To perform maximum likelihood estimation 
library(gmm) # To perform the generalised method of moments
library(dvmisc) # To obtain mean squared error
library(corrplot) # Visualize the correlation matrix
library(car) # To print VIF
library(StatDA) # To generate qq-plot
library(ggplot2) # Already attached with the tidyverse; loaded again here explicitly

2.0 Data Description

## Import dataset
Sampledata <- read_excel("Assignment dataset.xlsx")
View(Sampledata)

## Check the structure of the Sampledata
str(Sampledata)
## tibble [500 × 7] (S3: tbl_df/tbl/data.frame)
##  $ House Price       : num [1:500] 221900 538000 180000 604000 510000 ...
##  $ x                 : num [1:500] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Number of Bedrooms: num [1:500] 3 3 2 4 3 4 3 3 3 3 ...
##  $ Number of Floors  : num [1:500] 1 2 1 1 1 1 2 1 1 2 ...
##  $ Grade             : num [1:500] 7 7 6 7 8 11 7 7 7 7 ...
##  $ Year of Completion: num [1:500] 1955 1951 1933 1965 1987 ...
##  $ Lot Size          : num [1:500] 5650 7639 8062 5000 7503 ...
## Remove the unnecessary column
Data1 <- subset(Sampledata,select = -c(x))
View(Data1)

### Change the column names of Data1
## y = House price
## x1 = Number of bedrooms
## x2 = Number of floors
## x3 = Grade
## x4 = Year of completion
## x5 = Lot size
colnames(Data1) <- c("y","x1","x2","x3","x4","x5")

3.0 Data Analysis

3.1 Number of Bedrooms (x1)

##Plot boxplot for Number of Bedrooms (x1)
boxplot(Data1$x1, 
        col = "#1F3552",
        main = "Boxplot of Number of Bedrooms (x1)",
        xlab = "Number of Bedrooms (x1)",
        horizontal = TRUE)

print(summary(Data1$x1), digits = 10)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   3.000   3.366   4.000   6.000

3.2 Number of Floors (x2)

##Plot boxplot for Number of Floors (x2)
boxplot(Data1$x2, 
        col = "#1F3552",
        main = "Boxplot of Number of Floors (x2)",
        xlab = "Number of Floors (x2)",
        horizontal = TRUE)

print(summary(Data1$x2), digits = 10)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.445   2.000   3.000

3.3 Grade (x3)

##Plot boxplot for Grade (x3)
boxplot(Data1$x3, 
        col = "#1F3552",
        main = "Boxplot of Grade (x3)",
        xlab = "Grade (x3)",
        horizontal = TRUE)

print(summary(Data1$x3), digits = 10)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.000   7.000   7.000   7.654   8.000  12.000

3.4 Year of Completion (x4)

##Plot boxplot for Year of Completion (x4)
boxplot(Data1$x4, 
        col = "#1F3552",
        main = "Boxplot of Year of Completion (x4)",
        xlab = "Year of Completion (x4)",
        horizontal = TRUE)

print(summary(Data1$x4), digits = 10)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 1900.000 1953.000 1972.000 1968.512 1991.000 2014.000

3.5 Lot Size (x5)

##Plot boxplot for Lot Size (x5)
boxplot(Data1$x5, 
        col = "#1F3552",
        main = "Boxplot of Lot Size (x5)",
        xlab = "Lot Size (x5)",
        horizontal = TRUE)

print(summary(Data1$x5), digits = 10)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   1106.00   5569.00   8130.50  13774.21  11014.00 211267.00
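
### As a compact cross-check of Sections 3.1-3.5 (a sketch; psych is attached above), describe() tabulates these summaries for every variable at once. Note the mean of Lot Size (x5) sitting far above its median, confirming its strong right skew.
describe(Data1)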

3.6 House Price (y)

### Plot scatterplot of House Price (y)
plot(1:500,Data1$y,
     main = "Scatterplot of House Price (y)", 
     xlab = "Datapoint",
     ylab = "House Price (y)", 
     pch = 21, 
     col = "blue",
     bg = "blue")

### Fit a normal distribution and plot the histogram of House Price (y)
fn <- fitdist(Data1$y, "norm")
plot.legend <- c("Normal")
denscomp(list(fn),
         legendtext = plot.legend,
         fitcol = "red",
         datacol = "#1F3552",
         main = "Histogram of House Price (y)", 
         ylab = "Density",
         xlab = "House Price (y)")
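
### Optional numeric goodness-of-fit statistics for the fitted normal (a sketch; gofstat() is part of fitdistrplus):
gofstat(fn)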

3.7 Scatterplot Matrix

### Plot scatterplot matrix
pairs(Data1,
      upper.panel = NULL,
      cex.labels = 1.25,
      main = "Scatterplot Matrix")

### Plot scatterplot and correlation matrix
pairs.panels(Data1, 
             method = "pearson", 
             main = "Scatterplot and Correlation Matrix")
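
### The corrplot package was attached above but not yet used; a minimal sketch visualising the same correlation matrix:
corrplot(cor(Data1), method = "ellipse", type = "lower")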

4.0 Parameters Estimation

4.1 Maximum Likelihood Function (MLE)

## Estimation of parameters (mu and sigma) using Maximum Likelihood Estimation (MLE)
### Define the negative log-likelihood function (mle() minimises it)
likelihood_function <- function(mu, sigma){
  -sum(dnorm(Data1$y,
             mean = mu,
             sd = sigma,
             log = TRUE))
}

### Estimate the parameters using the mle function
### (note: adding a lower bound on sigma, e.g. lower = c(-Inf, 1e-8), would guard against invalid standard deviations during optimisation)
normal_fit <- mle(likelihood_function,
                  start = list(mu = 0, sigma = 1),
                  method = 'L-BFGS-B')
summary(normal_fit)
## Maximum likelihood estimation
## 
## Call:
## mle(minuslogl = likelihood_function, start = list(mu = 0, sigma = 1), 
##     method = "L-BFGS-B")
## 
## Coefficients:
##       Estimate Std. Error
## mu    525570.2   2097.152
## sigma 349905.6   1482.910
## 
## -2 log L: 14184.34
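
### Sanity check (a sketch): for a normal model the MLE has a closed form, mu-hat = sample mean and sigma-hat = sqrt of the mean squared deviation (n denominator); these should match the mle() estimates above.
mu_hat <- mean(Data1$y)
sigma_hat <- sqrt(mean((Data1$y - mu_hat)^2))
c(mu = mu_hat, sigma = sigma_hat)

### The gmm package was attached above but not yet used; a minimal sketch of the same estimation by the generalised method of moments, using the moment conditions E[y - mu] = 0 and E[(y - mu)^2 - sigma^2] = 0 (the starting values are illustrative assumptions):
g_norm <- function(theta, y) {
  cbind(y - theta[1], (y - theta[1])^2 - theta[2]^2)
}
gmm_fit <- gmm(g_norm, x = Data1$y, t0 = c(mu = 500000, sigma = 300000))
coef(gmm_fit)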

4.2 Linear Model Function

## Estimate the coefficients of the multiple regression model (beta_i, i = 1, ..., 5, plus the intercept beta_0)
Model <- lm(y ~ ., data = Data1)
summary(Model)
## 
## Call:
## lm(formula = y ~ ., data = Data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -504165 -134780  -21179   82788 2129643 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.847e+06  8.716e+05   6.709 5.39e-11 ***
## x1           2.234e+04  1.457e+04   1.534  0.12576    
## x2          -3.075e+03  2.548e+04  -0.121  0.90398    
## x3           2.226e+05  1.197e+04  18.602  < 2e-16 ***
## x4          -3.615e+03  4.597e+02  -7.864 2.36e-14 ***
## x5           1.391e+00  4.664e-01   2.981  0.00301 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 248500 on 494 degrees of freedom
## Multiple R-squared:  0.5015, Adjusted R-squared:  0.4964 
## F-statistic: 99.38 on 5 and 494 DF,  p-value: < 2.2e-16

5.0 Hypothesis Testing

5.1 Hypothesis Testing for Initial Model

summary(Model)
## 
## Call:
## lm(formula = y ~ ., data = Data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -504165 -134780  -21179   82788 2129643 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.847e+06  8.716e+05   6.709 5.39e-11 ***
## x1           2.234e+04  1.457e+04   1.534  0.12576    
## x2          -3.075e+03  2.548e+04  -0.121  0.90398    
## x3           2.226e+05  1.197e+04  18.602  < 2e-16 ***
## x4          -3.615e+03  4.597e+02  -7.864 2.36e-14 ***
## x5           1.391e+00  4.664e-01   2.981  0.00301 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 248500 on 494 degrees of freedom
## Multiple R-squared:  0.5015, Adjusted R-squared:  0.4964 
## F-statistic: 99.38 on 5 and 494 DF,  p-value: < 2.2e-16
anova(Model)
Fvalues <- anova(Model)["F value"] # Extract the sequential F values
### Each sequential SS has 1 df and shares the same MSE denominator, so the mean of the sequential F values equals the overall F statistic (also available as summary(Model)$fstatistic)
Fvalues_overall <- mean(Fvalues$`F value`, na.rm=TRUE)
Fvalues_overall
## [1] 99.38197
alpha = 0.05
### Note: the overall F test is one-sided, so the alpha = 0.05 critical value is strictly qf(0.95, 5, 494); either way, F = 99.38 far exceeds it
qf(0.975,5,494)
## [1] 2.592178
qt(0.975,494)
## [1] 1.964778
confint(Model, level=0.95)
##                     2.5 %        97.5 %
## (Intercept)  4.134669e+06  7.559588e+06
## x1          -6.281811e+03  5.097053e+04
## x2          -5.313490e+04  4.698437e+04
## x3           1.991303e+05  2.461632e+05
## x4          -4.517877e+03 -2.711561e+03
## x5           4.741806e-01  2.306906e+00
### Since x1 and x2 have p-values > 0.05 and |t| values < 1.965, these two regressors do not contribute significantly to the model. Their confidence intervals also contain 0, so their true coefficients could be zero; that is, there may be no linear relationship between y and x1 or x2, consistent with the test statistics above.
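
### Hand computation of one interval as a cross-check (a sketch): beta_hat +/- t(0.975, 494) * se(beta_hat) for x3 reproduces the corresponding confint() row above.
b_x3 <- coef(summary(Model))["x3", ]
b_x3["Estimate"] + c(-1, 1) * qt(0.975, 494) * b_x3["Std. Error"]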

5.2 Hypothesis Testing for Model without x1

Model_1 <- lm(y ~ .-x1, data = Data1)
summary(Model_1)
## 
## Call:
## lm(formula = y ~ . - x1, data = Data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -526676 -135274  -25473   86866 2129506 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.903e+06  8.720e+05   6.770 3.66e-11 ***
## x2          -1.571e+03  2.549e+04  -0.062  0.95089    
## x3           2.293e+05  1.117e+04  20.531  < 2e-16 ***
## x4          -3.632e+03  4.602e+02  -7.893 1.91e-14 ***
## x5           1.378e+00  4.670e-01   2.952  0.00331 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 248900 on 495 degrees of freedom
## Multiple R-squared:  0.4991, Adjusted R-squared:  0.495 
## F-statistic: 123.3 on 4 and 495 DF,  p-value: < 2.2e-16
anova(Model_1)
alpha = 0.05
qt(0.975,495)
## [1] 1.964768
confint(Model_1, level=0.95)
##                     2.5 %        97.5 %
## (Intercept)  4.190157e+06  7.616690e+06
## x2          -5.166140e+04  4.851968e+04
## x3           2.073616e+05  2.512501e+05
## x4          -4.536134e+03 -2.727907e+03
## x5           4.608418e-01  2.295791e+00
### Removing regressor x1 alone does not improve the R-squared of the model, nor does it change the significance of x2. Hence it is clear that x1 is not significant to the model.

5.3 Hypothesis Testing for Model without x2

Model_2 <- lm(y ~ .-x2, data = Data1)
summary(Model_2)
## 
## Call:
## lm(formula = y ~ . - x2, data = Data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -505167 -134723  -21478   83919 2132488 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.876e+06  8.375e+05   7.016 7.52e-12 ***
## x1           2.228e+04  1.454e+04   1.532  0.12625    
## x3           2.222e+05  1.148e+04  19.361  < 2e-16 ***
## x4          -3.630e+03  4.416e+02  -8.220 1.79e-15 ***
## x5           1.395e+00  4.645e-01   3.003  0.00281 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 248300 on 495 degrees of freedom
## Multiple R-squared:  0.5015, Adjusted R-squared:  0.4974 
## F-statistic: 124.5 on 4 and 495 DF,  p-value: < 2.2e-16
anova(Model_2)
alpha = 0.05
qt(0.975,495)
## [1] 1.964768
confint(Model_2, level=0.95)
##                     2.5 %        97.5 %
## (Intercept)  4.230531e+06  7.521323e+06
## x1          -6.299664e+03  5.085297e+04
## x3           1.996886e+05  2.447961e+05
## x4          -4.497546e+03 -2.762352e+03
## x5           4.822982e-01  2.307588e+00
### Likewise, removing regressor x2 alone does not improve the R-squared of the model, nor does it change the significance of x1. Hence it is clear that x2 is not significant to the model.

5.4 Hypothesis Testing for Model without x1 and x2

Model_6 <- lm(y ~ .-x1-x2, data = Data1)
summary(Model_6)
## 
## Call:
## lm(formula = y ~ . - x1 - x2, data = Data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -527002 -134745  -25123   85725 2130961 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.918e+06  8.381e+05   7.061 5.61e-12 ***
## x3           2.291e+05  1.059e+04  21.638  < 2e-16 ***
## x4          -3.640e+03  4.421e+02  -8.232 1.64e-15 ***
## x5           1.381e+00  4.650e-01   2.969  0.00313 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 248600 on 496 degrees of freedom
## Multiple R-squared:  0.4991, Adjusted R-squared:  0.4961 
## F-statistic: 164.7 on 3 and 496 DF,  p-value: < 2.2e-16
anova(Model_6)
alpha = 0.05
qt(0.975,496)
## [1] 1.964758
confint(Model_6, level=0.95)
##                     2.5 %        97.5 %
## (Intercept)  4.271339e+06  7.564798e+06
## x3           2.082870e+05  2.498903e+05
## x4          -4.508462e+03 -2.771107e+03
## x5           4.668944e-01  2.294278e+00
### Removing both x1 and x2 shows no sign of worsening the R-squared of the model, nor does it affect the significance of the remaining regressors. We conclude that x1 and x2 are not significant and show no dependency on, or relationship with, the other regressors. A direct joint test is sketched below.
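
### The joint hypothesis H0: beta_1 = beta_2 = 0 can also be tested directly with a partial F test on the nested models (a sketch using base R's anova):
anova(Model_6, Model)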

5.5 Hypothesis Testing for Model without x3

Model_3 <- lm(y ~ .-x3, data = Data1)
summary(Model_3)
## 
## Call:
## lm(formula = y ~ . - x3, data = Data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -734358 -178316  -61846  110045 2559641 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.077e+06  1.104e+06   1.881 0.060529 .  
## x1           1.207e+05  1.769e+04   6.822 2.62e-11 ***
## x2           1.296e+05  3.186e+04   4.067 5.54e-05 ***
## x4          -1.106e+03  5.725e+02  -1.931 0.054007 .  
## x5           2.264e+00  6.045e-01   3.745 0.000201 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 323800 on 495 degrees of freedom
## Multiple R-squared:  0.1523, Adjusted R-squared:  0.1454 
## F-statistic: 22.23 on 4 and 495 DF,  p-value: < 2.2e-16
anova(Model_3)
alpha = 0.05
qt(0.975,495)
## [1] 1.964768
confint(Model_3)
##                     2.5 %       97.5 %
## (Intercept) -92262.898941 4.247087e+06
## x1           85914.911518 1.554164e+05
## x2           66986.471431 1.921975e+05
## x4           -2230.378980 1.911714e+01
## x5               1.076281 3.451635e+00
### Removing x3 clearly drops the R-squared of the model and also changes the significance of regressors x1, x2 and x4. This shows the significant role of x3 and that multicollinearity exists, since its removal affects the significance of the other regressors.

5.6 Hypothesis Testing for Model without x4

Model_4 <- lm(y ~ .-x4, data = Data1)
summary(Model_4)
## 
## Call:
## lm(formula = y ~ . - x4, data = Data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -492915 -143585  -32307   80231 2049566 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -9.812e+05  7.948e+04 -12.346   <2e-16 ***
## x1           2.516e+04  1.543e+04   1.630   0.1038    
## x2          -5.807e+04  2.596e+04  -2.237   0.0257 *  
## x3           1.950e+05  1.212e+04  16.085   <2e-16 ***
## x5           9.655e-01  4.909e-01   1.967   0.0498 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 263400 on 495 degrees of freedom
## Multiple R-squared:  0.4391, Adjusted R-squared:  0.4345 
## F-statistic: 96.86 on 4 and 495 DF,  p-value: < 2.2e-16
anova(Model_4)
alpha = 0.05
qt(0.975,495)
## [1] 1.964768
confint(Model_4, level=0.95)
##                     2.5 %        97.5 %
## (Intercept) -1.137406e+06 -8.250805e+05
## x1          -5.168841e+03  5.548124e+04
## x2          -1.090802e+05 -7.062382e+03
## x3           1.712062e+05  2.188514e+05
## x5           1.049889e-03  1.930043e+00
### Unlike x3, removing x4 causes only a moderate drop in the R-squared of the model. However, it changes the significance of x2, as its confidence interval shows. This indicates that x4 plays a significant role and that multicollinearity exists, since its removal affects the significance of another regressor.

5.7 Hypothesis Testing for Model without x5

Model_5 <- lm(y ~ .-x5, data = Data1)
summary(Model_5)
## 
## Call:
## lm(formula = y ~ . - x5, data = Data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -482173 -135015  -21305   81055 2121698 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5537237.1   872228.3   6.348 4.93e-10 ***
## x1            21601.8    14683.2   1.471    0.142    
## x2            -9012.6    25602.2  -0.352    0.725    
## x3           226239.3    12002.7  18.849  < 2e-16 ***
## x4            -3455.9      460.2  -7.510 2.79e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 250500 on 495 degrees of freedom
## Multiple R-squared:  0.4925, Adjusted R-squared:  0.4884 
## F-statistic: 120.1 on 4 and 495 DF,  p-value: < 2.2e-16
anova(Model_5)
alpha = 0.05
qt(0.975,495)
## [1] 1.964768
confint(Model_5, level=0.95)
##                   2.5 %      97.5 %
## (Intercept) 3823510.913 7250963.280
## x1            -7247.210   50450.845
## x2           -59314.944   41289.797
## x3           202656.668  249821.866
## x4            -4360.095   -2551.716
### Removing x5 has little effect on the R-squared or on the significance of the other regressors. Nevertheless, in the full model of Section 5.1 x5 is significant, with a p-value (0.00301) below 0.05. Since the change on removal is so small, x5 is necessary, but its contribution is minor compared with x3 and x4.

6.0 Final and Best Fitted Model

# Definition of best fitted model
FinalModel <- lm(y ~ . - x1 - x2, data = Data1)
summary(FinalModel)
## 
## Call:
## lm(formula = y ~ . - x1 - x2, data = Data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -527002 -134745  -25123   85725 2130961 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.918e+06  8.381e+05   7.061 5.61e-12 ***
## x3           2.291e+05  1.059e+04  21.638  < 2e-16 ***
## x4          -3.640e+03  4.421e+02  -8.232 1.64e-15 ***
## x5           1.381e+00  4.650e-01   2.969  0.00313 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 248600 on 496 degrees of freedom
## Multiple R-squared:  0.4991, Adjusted R-squared:  0.4961 
## F-statistic: 164.7 on 3 and 496 DF,  p-value: < 2.2e-16
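
### As an optional corroboration of this choice (a sketch), compare the candidate models on AIC; lower is better, and the model without x1 and x2 is expected to be competitive despite using fewer parameters.
AIC(Model, Model_1, Model_2, Model_6)
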
Data2 <- subset(Data1,select = -c(x1,x2))
view(Data2)

Data3 <- subset(Data1,select = -c(x1,x2))
Data3$logy <- log(Data3$y)
Data4 <- subset(Data3,select = -c(y))
view(Data4)
Model_Transform1 <- lm(logy ~ ., data = Data4)
summary(Model_Transform1)
## 
## Call:
## lm(formula = logy ~ ., data = Data4)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.91954 -0.24790  0.00964  0.22545  1.35007 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.119e+01  1.199e+00  17.672  < 2e-16 ***
## x3           3.618e-01  1.515e-02  23.884  < 2e-16 ***
## x4          -5.569e-03  6.325e-04  -8.805  < 2e-16 ***
## x5           1.811e-06  6.653e-07   2.721  0.00673 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3557 on 496 degrees of freedom
## Multiple R-squared:  0.5465, Adjusted R-squared:  0.5437 
## F-statistic: 199.2 on 3 and 496 DF,  p-value: < 2.2e-16
## Store the fitted values of the transformed model
Data4$transform_fitted <- fitted(Model_Transform1)


# Show the model graphically: observed values with fitted values overlaid
Data2$fitted <- fitted(FinalModel)

plot(1:500,
     Data1$y,
     main="Scatterplot of Observed Values and Fitted Values", 
     xlab="Datapoint",ylab="House Price", 
     pch=21, 
     col="blue",
     bg="blue")
points(1:500,
       Data2$fitted,
       pch=21,
       col="red",
       bg="red")
legend("topleft",
       c("Observed Values","Fitted Values"),
       fill=c("blue","red"),
       cex=0.75,
       box.lwd = 1)

6.1 Correlation and Multicollinearity

## Multicollinearity
### Plot scatterplot and correlation matrix for Final Model
pairs.panels(Data2, 
             method = "pearson", 
             main = "Scatterplot and Correlation Matrix")

### Calculate Variance Inflation Factors (VIF); values close to 1, well below the usual cutoff of 10, indicate that multicollinearity is not a concern in the final model
vif(FinalModel)
##       x3       x4       x5 
## 1.249537 1.253609 1.029234

7.0 Residuals

7.1 Error Normality

# Create a new column for ordinary residuals (ei)
Data2$ei <- resid(FinalModel)

# Create a new column for fitted values (y hat)
Data2$fitted <- predict(FinalModel)

# Calculate standardized residuals, di = ei / sqrt(MSE)
mse <- get_mse(FinalModel)
Data2$di <- Data2$ei / sqrt(mse)

# Calculate studentized residuals, ri = ei / sqrt(MSE * (1 - hii))
Data2$ri <- rstandard(FinalModel)

# Calculate R-student residuals, ti (externally studentized: MSE is re-estimated without observation i)
Data2$ti <- rstudent(FinalModel)

View(Data2)

7.2 Residual Plots

qqplot.das(Data2$ei, distribution = "norm", ylab = "Sample Quantiles",
           xlab = "Theoretical Quantiles", main = "Normal Q-Q Plot for Ordinary Residuals", las = par("las"),
           datax = FALSE, envelope = 0.95, labels = FALSE, col = "#1F3552",
           lwd = 2, pch = 20,line = c("quartiles","robust","none"), cex = 1,
           xaxt = "s", add.plot = FALSE, xlim = NULL, ylim = NULL)

qqplot.das(Data2$ri, distribution = "norm", ylab = "Sample Quantiles",
           xlab = "Theoretical Quantiles", main = "Normal Q-Q Plot for Studentized Residuals", las = par("las"),
           datax = FALSE, envelope = 0.95, labels = FALSE, col = "#1F3552",
           lwd = 2, pch = 20,line = c("quartiles","robust","none"), cex = 1,
           xaxt = "s", add.plot = FALSE, xlim = NULL, ylim = NULL)
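
### As a numerical complement to the Q-Q plots (a sketch; the assessment rests on the plots themselves), a Shapiro-Wilk test of normality on the ordinary residuals:
shapiro.test(Data2$ei)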

7.3 Independence of Error and Homogeneity of Variance of Error

# Plots of residuals against fitted values
ggplot(Data2, aes(x = fitted, y = ti)) + geom_point(color = "#1F3552")

ggplot(Data2, aes(x = fitted, y = ei)) + geom_point(color = "#1F3552")

# Plots of residuals against each regressor
ggplot(Data2, aes(x = x3, y = ti)) + geom_point(color = "#1F3552")

ggplot(Data2, aes(x = x4, y = ti)) + geom_point(color = "#1F3552")

ggplot(Data2, aes(x = x5, y = ti)) + geom_point(color = "#1F3552")

7.4 Outliers

### Plot Final_Model
par(mfrow=c(1,2))
plot(FinalModel)

### Determine outliers
Observation <- c(1:nrow(Data2))
di <- data.frame(Observation,Data2$di)
outlier_di <- di[abs(di$Data2.di) > 2,]
outlier_di
### Form a new table of leverage and influence diagnostics: hii = hat-matrix diagonal, DFFITS, and CooksD = Cook's distance
hii <- hatvalues(FinalModel)
CooksD <- cooks.distance(FinalModel)
DFFITS <- dffits(FinalModel)
Comparison <- data.frame(hii,DFFITS,CooksD)

k = 3 #No. of Regressors
n = 500 #No. of Observations
p = k + 1 

### Standard Values

Lev_Std <- 2*p/n
Lev_Std
## [1] 0.016
CooksD_std<- 1
CooksD_std
## [1] 1
DFFITS_Std <- 2*sqrt(p/n)
DFFITS_Std
## [1] 0.1788854
### Check on leverage points 
lev_check <- Comparison[Comparison$hii > Lev_Std,]
lev_check[1]
nrow(lev_check)
## [1] 36
### Check on number of influential points (DFFITS); DFFITS can be negative, so |DFFITS| is compared with the cutoff
dffits_check <- Comparison[abs(Comparison$DFFITS) > DFFITS_Std,]
dffits_check[2]
nrow(dffits_check)
## [1] 14
### Check on number of influential points (CooksD)
CooksD_check <- Comparison[Comparison$CooksD > CooksD_std,]
CooksD_check[3]
nrow(CooksD_check)
## [1] 0
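
### Base R bundles these diagnostics; as an optional cross-check of the counts above (a sketch), influence.measures() flags observations marked by any criterion:
infl <- influence.measures(FinalModel)
summary(infl)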

8.0 Conclusion

## Transform the response y to log(y)
Data2$logy<-log(Data2$y)
Data3 <- subset(Data2, select=c(logy, x3, x4, x5))

## Fit a linear model on log(y), a step down the ladder of power transformations, to stabilise variance
Model_Transform <- lm(logy ~ ., data = Data3)
summary(Model_Transform)
## 
## Call:
## lm(formula = logy ~ ., data = Data3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.91954 -0.24790  0.00964  0.22545  1.35007 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.119e+01  1.199e+00  17.672  < 2e-16 ***
## x3           3.618e-01  1.515e-02  23.884  < 2e-16 ***
## x4          -5.569e-03  6.325e-04  -8.805  < 2e-16 ***
## x5           1.811e-06  6.653e-07   2.721  0.00673 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3557 on 496 degrees of freedom
## Multiple R-squared:  0.5465, Adjusted R-squared:  0.5437 
## F-statistic: 199.2 on 3 and 496 DF,  p-value: < 2.2e-16
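
### The log transformation is lambda = 0 on the Box-Cox ladder; a quick check of whether the profile likelihood supports it (a sketch; boxcox() is from MASS, attached earlier via fitdistrplus):
bc <- boxcox(FinalModel, lambda = seq(-0.5, 0.5, 0.01), plotit = FALSE)
bc$x[which.max(bc$y)] # a maximising lambda near 0 supports log(y)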

## Store the fitted values of the transformed model
Data2$transform_fitted <- fitted(Model_Transform)

## Obtain ordinary residual and R-Student residual of the Transformed Model
Data2$transform_ei <- Data2$logy - Data2$transform_fitted # equals resid(Model_Transform)
Data2$transform_ti <- rstudent(Model_Transform)
view(Data2)

## Residual Plot of Ordinary/Standardized Residual against the Predicted Response of the Transformed Model
par(mfrow=c(1,1))
ggplot(Data2, aes(x = transform_fitted, y = transform_ti)) + geom_point(color ="#1F3552") + ggtitle("Plot of R-Student Residuals against the Predicted Response of the Transformed Model")

ggplot(Data2, aes(x = transform_fitted, y = transform_ei)) + geom_point(color ="#1F3552") + ggtitle("Plot of Ordinary Residuals against the Predicted Response of the Transformed Model")

hist(Data2$transform_ei, main = "Histogram of Ordinary Residuals (Transformed Model)", xlab = "Ordinary Residual")

hist(Data2$transform_ti, main = "Histogram of R-Student Residuals (Transformed Model)", xlab = "R-Student Residual")

# The residuals scatter randomly around mean 0 with no obvious pattern, hence constant variance; this can be seen in both the histograms and the residual plots.

## Normal Q-Q Plot of Ordinary/Standardized Residual of the Transformed Model
qqplot.das(Data2$transform_ei, distribution = "norm", ylab = "Sample Quantiles",
           xlab = "Theoretical Quantiles", main = "Normal Q-Q plot of Ordinary Residual of the Transformed Model", las = par("las"),
           datax = FALSE, envelope = 0.95, labels = FALSE, col = "#1F3552",
           lwd = 2, pch = 20,line = c("quartiles","robust","none"), cex = 1,
           xaxt = "s", add.plot = FALSE, xlim = NULL, ylim = NULL)

qqplot.das(Data2$transform_ti, distribution = "norm", ylab = "Sample Quantiles",
           xlab = "Theoretical Quantiles", main = "Normal Q-Q plot of R-Student Residual of the Transformed Model", las = par("las"),
           datax = FALSE, envelope = 0.95, labels = FALSE, col = "#1F3552",
           lwd = 2, pch = 20,line = c("quartiles","robust","none"), cex = 1,
           xaxt = "s", add.plot = FALSE, xlim = NULL, ylim = NULL)

#Scatterplot of Observed Values and Fitted Values
plot(1:500,
     Data2$logy,
     main="Scatterplot of Observed Values and Fitted Values", 
     xlab="Datapoint",ylab="log(House Price)", 
     pch=21, 
     col="blue",
     bg="blue")
points(1:500,
       Data2$transform_fitted,
       pch=21,
       col="red",
       bg="red")
legend("topleft",
       c("Observed Values","Fitted Values"),
       fill=c("blue","red"),
       cex=0.75,
       box.lwd = 1)
