library(readxl) # To read files from Microsoft Excel
library(fitdistrplus) # To plot histogram
## Warning: package 'fitdistrplus' was built under R version 4.3.3
## Loading required package: MASS
## Loading required package: survival
library(psych) # To plot scatterplot and correlation matrix
library(tidyverse) # For data manipulation and visualization
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stats4) # To perform maximum likelihood estimation
library(gmm) # To perform generalised method of moment
## Warning: package 'gmm' was built under R version 4.3.3
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.3.3
library(dvmisc) # To obtain mean squared error
## Warning: package 'dvmisc' was built under R version 4.3.3
## Loading required package: rbenchmark
##
## Attaching package: 'dvmisc'
##
## The following object is masked from 'package:tidyr':
##
## expand_grid
##
## The following object is masked from 'package:psych':
##
## headtail
library(corrplot) # Visualize the correlation matrix
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.92 loaded
library(car) # To print VIF
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:psych':
##
## logit
library(StatDA) # To generate qq-plot
## Warning: package 'StatDA' was built under R version 4.3.3
## Loading required package: sgeostat
## Registered S3 method overwritten by 'geoR':
## method from
## plot.variogram sgeostat
library(ggplot2)
## Import dataset
Sampledata <- read_excel("Assignment dataset.xlsx")
View(Sampledata)
## Check the structure of the Sampledata
str(Sampledata)
## tibble [500 × 7] (S3: tbl_df/tbl/data.frame)
## $ House Price : num [1:500] 221900 538000 180000 604000 510000 ...
## $ x : num [1:500] 1 2 3 4 5 6 7 8 9 10 ...
## $ Number of Bedrooms: num [1:500] 3 3 2 4 3 4 3 3 3 3 ...
## $ Number of Floors : num [1:500] 1 2 1 1 1 1 2 1 1 2 ...
## $ Grade : num [1:500] 7 7 6 7 8 11 7 7 7 7 ...
## $ Year of Completion: num [1:500] 1955 1951 1933 1965 1987 ...
## $ Lot Size : num [1:500] 5650 7639 8062 5000 7503 ...
## Remove the unnecessary column
Data1 <- subset(Sampledata,select = -c(x))
View(Data1)
### Change the column names of Data1
## y = House price
## x1 = Number of bedrooms
## x2 = Number of floors
## x3 = Grade
## x4 = Year of completion
## x5 = Lot size
colnames(Data1) <- c("y","x1","x2","x3","x4","x5")
##Plot boxplot for Number of Bedroom (x1)
boxplot(Data1$x1,
col = "#1F3552",
main = "Boxplot of Number of Bedroom (x1)",
xlab = "Number of Bedroom (x1)",
horizontal = TRUE)
print(summary(Data1$x1), digits = 10)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 3.000 3.366 4.000 6.000
##Plot boxplot for Number of Floors (x2)
boxplot(Data1$x2,
col = "#1F3552",
main = "Boxplot of Number of Floors (x2)",
xlab = "Number of Floors (x2)",
horizontal = TRUE)
print(summary(Data1$x2), digits = 10)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.445 2.000 3.000
##Plot boxplot for Grade (x3)
boxplot(Data1$x3,
col = "#1F3552",
main = "Boxplot of Grade (x3)",
xlab = "Grade (x3)",
horizontal = TRUE)
print(summary(Data1$x3), digits = 10)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.000 7.000 7.000 7.654 8.000 12.000
##Plot boxplot for Year of Completion (x4)
boxplot(Data1$x4,
col = "#1F3552",
main = "Boxplot of Year of Completion (x4)",
xlab = "Year of Completion (x4)",
horizontal = TRUE)
print(summary(Data1$x4), digits = 10)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1900.000 1953.000 1972.000 1968.512 1991.000 2014.000
##Plot boxplot for Lot Size (x5)
boxplot(Data1$x5,
col = "#1F3552",
main = "Boxplot of Lot Size (x5)",
xlab = "Lot Size (x5)",
horizontal = TRUE)
print(summary(Data1$x5), digits = 10)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1106.00 5569.00 8130.50 13774.21 11014.00 211267.00
### Plot scatterplot of House Price (y)
plot(1:500,Data1$y,
main = "Scatterplot of House Price (y)",
xlab = "Datapoint",
ylab = "House Price (y)",
pch = 21,
col = "blue",
bg = "blue")
### Plot histogram of House Price (y)
fn<-fitdist(Data1$y,"norm")
plot.legend<-c("Normal")
denscomp(list(fn),
legendtext = plot.legend,
fitcol = "red",
datacol = "#1F3552",
main = "Histogram of House Price (y)",
ylab = "Datapoint",
xlab = "House Price (y)")
### Plot scatterplot matrix
pairs(Data1,
upper.panel = NULL,
cex.labels = 1.25,
main = "Scatterplot Matrix")
### Plot scatterplot and correlation matrix
pairs.panels(Data1,
method = "pearson",
main = "Scatterplot and Correlation Matrix")
## Estimation of parameters (mu and sigma) using Maximum Likelihood Estimation (MLE)
### Obtain the likelihood function
likelihood_function <-function(mu,sigma){
-sum(dnorm(Data1$y,
mean=mu,
sd=sigma,
log=TRUE))
}
### Determine the paramter using mle function
normal_fit<-mle(likelihood_function,
start=list(mu=0,sigma=1),
method='L-BFGS-B')
summary(normal_fit)
## Maximum likelihood estimation
##
## Call:
## mle(minuslogl = likelihood_function, start = list(mu = 0, sigma = 1),
## method = "L-BFGS-B")
##
## Coefficients:
## Estimate Std. Error
## mu 525570.2 2097.152
## sigma 349905.6 1482.910
##
## -2 log L: 14184.34
## Estimation of parameters/Coefficients of the Multiple Regression Model,(Bi,i=1,2,3,4,5)
Model<-lm(Data1$y ~ ., data=Data1)
summary(Model)
##
## Call:
## lm(formula = Data1$y ~ ., data = Data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -504165 -134780 -21179 82788 2129643
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.847e+06 8.716e+05 6.709 5.39e-11 ***
## x1 2.234e+04 1.457e+04 1.534 0.12576
## x2 -3.075e+03 2.548e+04 -0.121 0.90398
## x3 2.226e+05 1.197e+04 18.602 < 2e-16 ***
## x4 -3.615e+03 4.597e+02 -7.864 2.36e-14 ***
## x5 1.391e+00 4.664e-01 2.981 0.00301 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 248500 on 494 degrees of freedom
## Multiple R-squared: 0.5015, Adjusted R-squared: 0.4964
## F-statistic: 99.38 on 5 and 494 DF, p-value: < 2.2e-16
summary(Model)
##
## Call:
## lm(formula = Data1$y ~ ., data = Data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -504165 -134780 -21179 82788 2129643
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.847e+06 8.716e+05 6.709 5.39e-11 ***
## x1 2.234e+04 1.457e+04 1.534 0.12576
## x2 -3.075e+03 2.548e+04 -0.121 0.90398
## x3 2.226e+05 1.197e+04 18.602 < 2e-16 ***
## x4 -3.615e+03 4.597e+02 -7.864 2.36e-14 ***
## x5 1.391e+00 4.664e-01 2.981 0.00301 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 248500 on 494 degrees of freedom
## Multiple R-squared: 0.5015, Adjusted R-squared: 0.4964
## F-statistic: 99.38 on 5 and 494 DF, p-value: < 2.2e-16
anova(Model)
Fvalues <- anova(Model)["F value"] #Extract all F values
Fvalues_overall <- mean(Fvalues$`F value`, na.rm=TRUE)
Fvalues_overall
## [1] 99.38197
alpha = 0.05
qf(0.975,5,494)
## [1] 2.592178
qt(0.975,494)
## [1] 1.964778
confint(Model, level=0.95)
## 2.5 % 97.5 %
## (Intercept) 4.134669e+06 7.559588e+06
## x1 -6.281811e+03 5.097053e+04
## x2 -5.313490e+04 4.698437e+04
## x3 1.991303e+05 2.461632e+05
## x4 -4.517877e+03 -2.711561e+03
## x5 4.741806e-01 2.306906e+00
### We can see that since the x1 and x2 has absolute p values > 0.05 and absolute t values < 1.965. Hence, this shows that these 2 regressors does not contribute significantly to the model. At the same time, their confidence intervals include 0, which means that there is a possibility that the the coefficient of x1 and x2 will be equal to zero. Hence, it has the possibility that there is no linear relationship between y and x1, x2, which is also shown in by their statistical values that they do not contribute significantly to the model.
Model_1 <- lm(y ~ .-x1, data = Data1)
summary(Model_1)
##
## Call:
## lm(formula = y ~ . - x1, data = Data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -526676 -135274 -25473 86866 2129506
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.903e+06 8.720e+05 6.770 3.66e-11 ***
## x2 -1.571e+03 2.549e+04 -0.062 0.95089
## x3 2.293e+05 1.117e+04 20.531 < 2e-16 ***
## x4 -3.632e+03 4.602e+02 -7.893 1.91e-14 ***
## x5 1.378e+00 4.670e-01 2.952 0.00331 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 248900 on 495 degrees of freedom
## Multiple R-squared: 0.4991, Adjusted R-squared: 0.495
## F-statistic: 123.3 on 4 and 495 DF, p-value: < 2.2e-16
anova(Model_1)
alpha = 0.05
qt(0.975,495)
## [1] 1.964768
confint(Model_1, level=0.95)
## 2.5 % 97.5 %
## (Intercept) 4.190157e+06 7.616690e+06
## x2 -5.166140e+04 4.851968e+04
## x3 2.073616e+05 2.512501e+05
## x4 -4.536134e+03 -2.727907e+03
## x5 4.608418e-01 2.295791e+00
### Solely removing regressor x1 doesn't help to improve the R square values for the model, nor did it affect the significance of x2. Hence, it is very clear that x1 is not significant to the model.
Model_2 <- lm(y ~ .-x2, data = Data1)
summary(Model_2)
##
## Call:
## lm(formula = y ~ . - x2, data = Data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -505167 -134723 -21478 83919 2132488
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.876e+06 8.375e+05 7.016 7.52e-12 ***
## x1 2.228e+04 1.454e+04 1.532 0.12625
## x3 2.222e+05 1.148e+04 19.361 < 2e-16 ***
## x4 -3.630e+03 4.416e+02 -8.220 1.79e-15 ***
## x5 1.395e+00 4.645e-01 3.003 0.00281 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 248300 on 495 degrees of freedom
## Multiple R-squared: 0.5015, Adjusted R-squared: 0.4974
## F-statistic: 124.5 on 4 and 495 DF, p-value: < 2.2e-16
anova(Model_2)
alpha = 0.05
qt(0.975,495)
## [1] 1.964768
confint(Model_2, level=0.95)
## 2.5 % 97.5 %
## (Intercept) 4.230531e+06 7.521323e+06
## x1 -6.299664e+03 5.085297e+04
## x3 1.996886e+05 2.447961e+05
## x4 -4.497546e+03 -2.762352e+03
## x5 4.822982e-01 2.307588e+00
### Solely removing regressor x2 also doesn't help to improve the R square values for the model, nor did it affect the significance of x1. Hence, it is very clear that x1 is not significant to the model.
Model_6 <- lm(y ~ .-x1-x2, data = Data1)
summary(Model_6)
##
## Call:
## lm(formula = y ~ . - x1 - x2, data = Data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -527002 -134745 -25123 85725 2130961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.918e+06 8.381e+05 7.061 5.61e-12 ***
## x3 2.291e+05 1.059e+04 21.638 < 2e-16 ***
## x4 -3.640e+03 4.421e+02 -8.232 1.64e-15 ***
## x5 1.381e+00 4.650e-01 2.969 0.00313 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 248600 on 496 degrees of freedom
## Multiple R-squared: 0.4991, Adjusted R-squared: 0.4961
## F-statistic: 164.7 on 3 and 496 DF, p-value: < 2.2e-16
anova(Model_6)
alpha = 0.05
qt(0.975,496)
## [1] 1.964758
confint(Model_6, level=0.95)
## 2.5 % 97.5 %
## (Intercept) 4.271339e+06 7.564798e+06
## x3 2.082870e+05 2.498903e+05
## x4 -4.508462e+03 -2.771107e+03
## x5 4.668944e-01 2.294278e+00
### Removing both regressor x1 and x2 shoes no sign of improvement for the R square values for the model, nor did they affect the significance of the other regressors. Hence, it concludes that they are not significant and shows no dependency and relationship woth the other regressors.
Model_3 <- lm(y ~ .-x3, data = Data1)
summary(Model_3)
##
## Call:
## lm(formula = y ~ . - x3, data = Data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -734358 -178316 -61846 110045 2559641
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.077e+06 1.104e+06 1.881 0.060529 .
## x1 1.207e+05 1.769e+04 6.822 2.62e-11 ***
## x2 1.296e+05 3.186e+04 4.067 5.54e-05 ***
## x4 -1.106e+03 5.725e+02 -1.931 0.054007 .
## x5 2.264e+00 6.045e-01 3.745 0.000201 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 323800 on 495 degrees of freedom
## Multiple R-squared: 0.1523, Adjusted R-squared: 0.1454
## F-statistic: 22.23 on 4 and 495 DF, p-value: < 2.2e-16
anova(Model_3)
alpha = 0.05
qt(0.975,495)
## [1] 1.964768
confint(Model_3)
## 2.5 % 97.5 %
## (Intercept) -92262.898941 4.247087e+06
## x1 85914.911518 1.554164e+05
## x2 66986.471431 1.921975e+05
## x4 -2230.378980 1.911714e+01
## x5 1.076281 3.451635e+00
### The removal of x3 clearly shoes a drop in R Square value of the model and it also affected the significance of the regressors x1, x2 and x4. Hence, this shows the significance role of x3 and that multicollinearlity exist as its removal affects the significance of other regressors.
Model_4 <- lm(y ~ .-x4, data = Data1)
summary(Model_4)
##
## Call:
## lm(formula = y ~ . - x4, data = Data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -492915 -143585 -32307 80231 2049566
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.812e+05 7.948e+04 -12.346 <2e-16 ***
## x1 2.516e+04 1.543e+04 1.630 0.1038
## x2 -5.807e+04 2.596e+04 -2.237 0.0257 *
## x3 1.950e+05 1.212e+04 16.085 <2e-16 ***
## x5 9.655e-01 4.909e-01 1.967 0.0498 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 263400 on 495 degrees of freedom
## Multiple R-squared: 0.4391, Adjusted R-squared: 0.4345
## F-statistic: 96.86 on 4 and 495 DF, p-value: < 2.2e-16
anova(Model_4)
alpha = 0.05
qt(0.975,495)
## [1] 1.964768
confint(Model_4, level=0.95)
## 2.5 % 97.5 %
## (Intercept) -1.137406e+06 -8.250805e+05
## x1 -5.168841e+03 5.548124e+04
## x2 -1.090802e+05 -7.062382e+03
## x3 1.712062e+05 2.188514e+05
## x5 1.049889e-03 1.930043e+00
### Unlike x3, its removal only shows a moderate drop in R Square value of the model . However, it affected the significance of the regressors x2 in terms as shown in the CI. Hence, this shows a significance role of x4 and that multicollinearlity exist as its removal affects the significance of other regressors.
Model_5 <- lm(y ~ .-x5, data = Data1)
summary(Model_5)
##
## Call:
## lm(formula = y ~ . - x5, data = Data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -482173 -135015 -21305 81055 2121698
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5537237.1 872228.3 6.348 4.93e-10 ***
## x1 21601.8 14683.2 1.471 0.142
## x2 -9012.6 25602.2 -0.352 0.725
## x3 226239.3 12002.7 18.849 < 2e-16 ***
## x4 -3455.9 460.2 -7.510 2.79e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 250500 on 495 degrees of freedom
## Multiple R-squared: 0.4925, Adjusted R-squared: 0.4884
## F-statistic: 120.1 on 4 and 495 DF, p-value: < 2.2e-16
anova(Model_5)
alpha = 0.05
qt(0.975,495)
## [1] 1.964768
confint(Model_5, level=0.95)
## 2.5 % 97.5 %
## (Intercept) 3823510.913 7250963.280
## x1 -7247.210 50450.845
## x2 -59314.944 41289.797
## x3 202656.668 249821.866
## x4 -4360.095 -2551.716
### The removal of it doesnt show any effect towards the R Square value or the significance of other residuals. However, according to the global test in section 5.1, it is significant as its value is smaller than 0.05. However, as the difference is extreely minor, it also means that the significance of x5 is necessary but minor compared to x3 and x4.
# Definition of best fitted model
FinalModel<- lm(Data1$y ~ .-x1-x2, data=Data1)
summary(FinalModel)
##
## Call:
## lm(formula = Data1$y ~ . - x1 - x2, data = Data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -527002 -134745 -25123 85725 2130961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.918e+06 8.381e+05 7.061 5.61e-12 ***
## x3 2.291e+05 1.059e+04 21.638 < 2e-16 ***
## x4 -3.640e+03 4.421e+02 -8.232 1.64e-15 ***
## x5 1.381e+00 4.650e-01 2.969 0.00313 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 248600 on 496 degrees of freedom
## Multiple R-squared: 0.4991, Adjusted R-squared: 0.4961
## F-statistic: 164.7 on 3 and 496 DF, p-value: < 2.2e-16
Data2 <- subset(Data1,select = -c(x1,x2))
view(Data2)
Data3 <- subset(Data1,select = -c(x1,x2))
Data3$logy <- log(Data3$y)
Data4 <- subset(Data3,select = -c(y))
view(Data4)
Model_Transform1 <-lm(Data4$logy ~ ., data=Data4)
summary(Model_Transform1)
##
## Call:
## lm(formula = Data4$logy ~ ., data = Data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.91954 -0.24790 0.00964 0.22545 1.35007
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.119e+01 1.199e+00 17.672 < 2e-16 ***
## x3 3.618e-01 1.515e-02 23.884 < 2e-16 ***
## x4 -5.569e-03 6.325e-04 -8.805 < 2e-16 ***
## x5 1.811e-06 6.653e-07 2.721 0.00673 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3557 on 496 degrees of freedom
## Multiple R-squared: 0.5465, Adjusted R-squared: 0.5437
## F-statistic: 199.2 on 3 and 496 DF, p-value: < 2.2e-16
## Determine the fitted model of the Transformed model
Data4$transform_fitted<-fitted(Model_Transform1)
# Show model graphically with the fitted model
Data2$fitted <- fitted(FinalModel)
plot(1:500,
Data1$y,
main="Scatterplot of Observed Values and Fitted Values",
xlab="Datapoint",ylab="House Price",
pch=21,
col="blue",
bg="blue")
points(1:500,
Data2$fitted,
pch=21,
col="red",
bg="red")
legend("topleft",
c("Observed Values","Fitted Values"),
fill=c("blue","red"),
cex=0.75,
box.lwd = 1)
## Multicollinearlity
### Plot scatterplot and correlation matrix for Final Model
pairs.panels(Data2,
method = "pearson",
main = "Scatterplot and Correlation Matrix")
### Calculate Variance Infaltion Factors (VIF)
vif(FinalModel)
## x3 x4 x5
## 1.249537 1.253609 1.029234
#Create a new column for residuals (ei)
Data2$ei <-resid(FinalModel)
#Create a new column for fitted values (y hat)
Data2$fitted <-predict(FinalModel)
#Calculate Standardized Residuals (di)
mse <-get_mse(FinalModel)
Data2$di <-Data2$ei/sqrt(mse)
#Calculate Studentized Residuals
Data2$ri <- rstandard(FinalModel)
#Calculate R-Student Residuals
Data2$ti <- rstudent(FinalModel)
View(Data2)
qqplot.das(Data2$ei, distribution = "norm", ylab = "Sample Quantiles",
xlab = "Theoretical Quantiles", main = "Normal Q-Q Plot for Ordinary Residuals", las = par("las"),
datax = FALSE, envelope = 0.95, labels = FALSE, col = "#1F3552",
lwd = 2, pch = 20,line = c("quartiles","robust","none"), cex = 1,
xaxt = "s", add.plot = FALSE, xlim = NULL, ylim = NULL)
qqplot.das(Data2$ri, distribution = "norm", ylab = "sample Quantiles",
xlab = "Theoretical Quantiles", main = "Normal Q-Q Plot for R-Student Residuals", las = par("las"),
datax = FALSE, envelope = 0.95, labels = FALSE, col = "#1F3552",
lwd = 2, pch = 20,line = c("quartiles","robust","none"), cex = 1,
xaxt = "s", add.plot = FALSE, xlim = NULL, ylim = NULL)
#Plots of residuals against fitted values
ggplot(FinalModel, aes(x = Data2$fitted, y = Data2$ti)) + geom_point(color ="#1F3552")
ggplot(FinalModel, aes(x = Data2$fitted, y = Data2$ei)) + geom_point(color ="#1F3552")
#Plots of residuals against regressor
ggplot(FinalModel, aes(x = Data2$x3, y = Data2$ti)) + geom_point(color ="#1F3552")
ggplot(FinalModel, aes(x = Data2$x4, y = Data2$ti)) + geom_point(color ="#1F3552")
ggplot(FinalModel, aes(x = Data2$x5, y = Data2$ti)) + geom_point(color ="#1F3552")
### Plot Final_Model
par(mfrow=c(1,2))
plot(FinalModel)
### Determine outliers
Observation <- c(1:nrow(Data2))
di <- data.frame(Observation,Data2$di)
outlier_di <- di[abs(di$Data2.di) > 2,]
outlier_di
### Form a New Table for Diagnostic of Leverage and Influence Points where hii = diagonal values, DFFITS = DFFITS and CookD = Cook's Distance
hii <- hatvalues(FinalModel)
CooksD <- cooks.distance(FinalModel)
DFFITS <- dffits(FinalModel)
Comparison <- data.frame(hii,DFFITS,CooksD)
k = 3 #No. of Regressors
n = 500 #No. of Observations
p = k + 1
### Standard Values
Lev_Std <- 2*p/n
Lev_Std
## [1] 0.016
CooksD_std<- 1
CooksD_std
## [1] 1
DFFITS_Std <- 2*sqrt(p/n)
DFFITS_Std
## [1] 0.1788854
### Check on leverage points
lev_check <- Comparison[Comparison$hii > Lev_Std,]
lev_check[1]
nrow(lev_check)
## [1] 36
### Check on number of influential points (DFFITS)
dffits_check <- Comparison[Comparison$DFFITS > DFFITS_Std,]
dffits_check [2]
nrow(dffits_check)
## [1] 14
### Check on number of influential points (CooksD)
CooksD_check <- Comparison[Comparison$CooksD > CooksD_std,]
CooksD_check [3]
nrow(CooksD_check)
## [1] 0
#8.0 Conclusion
## Transform regressor y to logy
Data2$logy<-log(Data2$y)
Data3 <- subset(Data2, select=c(logy, x3, x4, x5))
## Fit a lm model using logy #To reduce power ladder
Model_Transform <-lm(Data3$logy ~ ., data=Data3)
summary(Model_Transform)
##
## Call:
## lm(formula = Data3$logy ~ ., data = Data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.91954 -0.24790 0.00964 0.22545 1.35007
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.119e+01 1.199e+00 17.672 < 2e-16 ***
## x3 3.618e-01 1.515e-02 23.884 < 2e-16 ***
## x4 -5.569e-03 6.325e-04 -8.805 < 2e-16 ***
## x5 1.811e-06 6.653e-07 2.721 0.00673 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3557 on 496 degrees of freedom
## Multiple R-squared: 0.5465, Adjusted R-squared: 0.5437
## F-statistic: 199.2 on 3 and 496 DF, p-value: < 2.2e-16
Data2$fitted <- (Data1$y)
## Determine the fitted model of the Transformed model
Data2$transform_fitted<-fitted(Model_Transform)
## Obtain ordinary residual and R-Student residual of the Transformed Model
Data2$transform_ei <- Data2$logy - Data2$transform_fitted
Data2$transform_ti <- rstudent(Model_Transform)
view(Data2)
## Residual Plot of Ordinary/Standardized Residual against the Predicted Response of the Transformed Model
par(mfrow=c(1,1))
ggplot(Model_Transform, aes(x = Data2$fitted, y = Data2$transform_ti) ) + geom_point(color ="#1F3552") + ggtitle("Plot of R-Student Residuals against the Predicted Response of the Transformed Model")
ggplot(Model_Transform, aes(x = Data2$fitted, y = Data2$transform_ei)) + geom_point(color ="#1F3552") + ggtitle("Plot of Ordinary Residuals against the Predicted Response of the Transformed Model")
hist(Data2$transform_ei)
hist(Data2$transform_ti)
#Shows randomly scattered with mean 0 and no onvious pattern hence a constant variance. Can be ssen in both histogram and residual plot
## Normal Q-Q Plot of Ordinary/Standardized Residual of the Transformed Model
qqplot.das(Data2$transform_ei, distribution = "norm", ylab = "Sample Quantiles",
xlab = "Theoretical Quantiles", main = "Normal Q-Q plot of Ordinary Residual of the Transformed Model", las = par("las"),
datax = FALSE, envelope = 0.95, labels = FALSE, col = "#1F3552",
lwd = 2, pch = 20,line = c("quartiles","robust","none"), cex = 1,
xaxt = "s", add.plot = FALSE, xlim = NULL, ylim = NULL)
qqplot.das(Data2$transform_ti, distribution = "norm", ylab = "Sample Quantiles",
xlab = "Theoretical Quantiles", main = "Normal Q-Q plot of R-Student Residual of the Transformed Model", las = par("las"),
datax = FALSE, envelope = 0.95, labels = FALSE, col = "#1F3552",
lwd = 2, pch = 20,line = c("quartiles","robust","none"), cex = 1,
xaxt = "s", add.plot = FALSE, xlim = NULL, ylim = NULL)
#Scatterplot of Observed Values and Fitted Values
plot(1:500,
Data2$logy,
main="Scatterplot of Observed Values and Fitted Values",
xlab="Datapoint",ylab="House Price",
pch=21,
col="blue",
bg="blue")
points(1:500,
Data2$transform_fitted,
pch=21,
col="red",
bg="red")
legend("topleft",
c("Observed Values","Fitted Values"),
fill=c("blue","red"),
cex=0.75,
box.lwd = 1)
plot(1:500,
Data2$logy,
main="Scatterplot of Observed Values and Fitted Values",
xlab="Datapoint",ylab="House Price",
pch=21,
col="blue",
bg="blue")
points(1:500,
Data2$transform_fitted,
pch=21,
col="red",
bg="red")
legend("topleft",
c("Observed Values","Fitted Values"),
fill=c("blue","red"),
cex=0.75,
box.lwd = 1)