DAT 605 FINAL SPRING 2020

library(dplyr)
library(knitr)
library(moments)
library(kableExtra)
library(reshape2)
library(ggplot2)
library(tidyr)
library(corrplot)
library(MASS)
library(pastecs)
library(psych)
library(e1071)
library(fBasics)
library(mosaic)
require(ggthemes)
library(plogr)
require(Matrix)
library(onehot)
library(data.table)
library(mltools)
library(car)

Problem 1


Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with mean \(\mu = (N+1)/2\) and standard deviation \(\sigma = (N+1)/2\).


#let N be 10
N <- 10

set.seed(1)
X <- runif(10000, 1, N)

set.seed(1)
Y <- rnorm(10000, (N+1)/2, (N+1)/2)

xmean <- mean(X)
ymean<-mean(Y)
xmedian <-median(X)
ymedian <-median(Y)

print(paste0("Mean of X is ", round(xmean,2)))
## [1] "Mean of X is 5.5"
print(paste0("Mean of Y is ", round(ymean,2)))
## [1] "Mean of Y is 5.46"
print(paste0("Median of X is ",round(xmedian,2) ))
## [1] "Median of X is 5.46"
print(paste0("Median of Y is ", round( ymedian,2)))
## [1] "Median of Y is 5.41"
#plots of the 2 distributions
hist(X, col='#80cbc4', xlab="X", main="Uniform Distribution of X")

hist(Y, col='green', xlab="Y", main="Normal Distribution of Y")

Observation 1: X is spread roughly evenly between 1 and N = 10 (a rectangular shape), while Y is bell-shaped and centered near (N+1)/2 = 5.5 with a much wider spread (it even takes negative values, since its standard deviation is also 5.5). The means and medians of both variables are all close to 5.5.


Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

  a. P(X>x | X>y)
    Using the conditional probability formula: \(P(A \mid B) = P(A \cap B)/P(B)\)

#creating dataframe for the uniform distribution of X only
df <- as.data.frame((X))
head(df)
##        (X)
## 1 3.389578
## 2 4.349115
## 3 6.155680
## 4 9.173870
## 5 2.815137
## 6 9.085507
#To determine 1st quartile of Y
summary(Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -14.692   1.796   5.412   5.464   9.227  26.457
y <- quantile(Y, 0.25)
y
##      25% 
## 1.796331
#To determine the median (2nd quartile) of X
summary(X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.001   3.273   5.462   5.502   7.812   9.999
x <- quantile(X, 0.5)
x
##      50% 
## 5.461671
options(digits=4)
#because X is uniformly distributed, every value is equally likely (rectangular distribution)
p1<-nrow(subset(df, X > x & X>y))/nrow(subset(df,X>y))
p1
## [1] 0.5499

  b. P(X>x, Y>y)

options(digits=4)
#For the uniform distribution
Px <- nrow(subset(df, X > x))/nrow(df)
#But since Y is normally distributed, let's find the z score first and use pnorm to obtain the probability P(Y>y)
z <- ((y-(N+1)/2))/((N+1)/2)
Py <-1-pnorm(z) 
# Now apply the formula of P(A)*P(B)
Px*Py
##    25% 
## 0.3748
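As a cross-check (a sketch using the simulated vectors directly), the joint probability can also be estimated empirically and compared with the product of the marginals above:

#empirical joint probability estimated from the simulated (X, Y) pairs
p_joint <- mean(X > x & Y > y)
p_joint #should be close to Px*Py if X>x and Y>y are independent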

  c. P(X<x | X>y)

Apply the same principle of conditional probability as in part (a).


options(digits=4)
p2<-nrow(subset(df, X < x & X>y))/nrow(subset(df,X>y))
p2
## [1] 0.4501

Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.


n<-nrow(df)
c1<-nrow(subset(df, X < x & Y<y))/n
c2<-nrow(subset(df, X < x & Y>y))/n
c3<-c1+c2
c4<-nrow(subset(df, X > x & Y<y))/n
c5<-nrow(subset(df, X > x & Y>y))/n
c6<-c4+c5
c7<-c1+c4
c8<-c2+c5
c9<-c3+c6


dfcounts<-matrix(round(c(c1,c2,c3,c4,c5,c6,c7,c8,c9),3), ncol=3, nrow=3, byrow=T)
colnames(dfcounts)<-c(
"Y<y",
"Y>y",
"Total")
rownames(dfcounts)<-c("X<x","X>x","Total")

print("Contingency Table by Percentage")
## [1] "Contingency Table by Percentage"
dfcounts<-as.table(dfcounts)
dfcounts
##         Y<y   Y>y Total
## X<x   0.126 0.374 0.500
## X>x   0.124 0.376 0.500
## Total 0.250 0.750 1.000
options(digits=4)
#from the percentage contingency table above
P_X_greater_x <- 0.5 #marginal across of the table (P(X>x))
P_Y_greater_y<- 0.75 #marginal vertical of the table (P(Y>y))
P_X_greater_x*P_Y_greater_y
## [1] 0.375

P(X>x and Y>y), from the joint cell of the contingency table => 0.376

P(X>x)P(Y>y), from multiplying the marginal probabilities of the contingency table together (see above) => 0.375

Ans: the two values are essentially equal, which is consistent with X>x and Y>y being independent
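For reference, the same table can be built as counts directly from the logical comparisons (a sketch; addmargins adds the row and column totals):

#equivalent contingency table of counts built directly from the logical vectors
addmargins(table(X > x, Y > y))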


Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?

H0: X>x and Y>y are independent events.
HA: X>x and Y>y are dependent events.


#Converting the percentage contingency table to counts to perform Fisher and Chi Square tests of independence
print("Contingency Table by Counts")
## [1] "Contingency Table by Counts"
dfvals<-round(dfcounts*10000,0)
#subsetting contingency table for only the relevant counts
dfcountssubset <- dfvals[c(1:2),c(1:2)]
fisher.test(dfcountssubset )#Fisher Test
## 
##  Fisher's Exact Test for Count Data
## 
## data:  dfcountssubset
## p-value = 0.7
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.9321 1.1195
## sample estimates:
## odds ratio 
##      1.022
chisq.test(dfcountssubset)#Chi square test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  dfcountssubset
## X-squared = 0.19, df = 1, p-value = 0.7

Conclusion: Both tests return a p-value of about 0.7, far above 0.05, so we fail to reject H0; the data are consistent with X>x and Y>y being independent. The difference between the tests is that Fisher's Exact Test computes an exact p-value and is designed for small samples, while the Chi-Square test relies on a large-sample approximation. With 10,000 observations the Chi-Square test is the more appropriate (and cheaper) choice here, although both lead to the same conclusion.


Problem # 2: Advanced Regression Techniques competition in kaggle.com

url1<- "https://raw.githubusercontent.com/ssufian/DAT-605/master/train.csv"

train_df <- read.csv(url1, header = TRUE, sep = ",",stringsAsFactors = FALSE)

#replace NA with zeros (applied across all columns; only the numeric columns are used below)
dfnum<-train_df %>% mutate_all(~replace(., is.na(.), 0))
#checking dimensions of dataframe
nrow(dfnum)
## [1] 1460
ncol(dfnum)
## [1] 81
#Taking only columns with numerical data
dfnum1<- dfnum[c(2,4,5, 18:21, 27,35, 37:39, 44:53, 55, 57, 60, 62, 63, 67, 72, 76:78, 81)]

df_unistats<- basicStats(dfnum1)[c("Minimum", "Maximum", "1. Quartile", "3. Quartile", "Mean", "Median",
                               "Variance", "Stdev", "Skewness", "Kurtosis"), ] %>% t() %>% as.data.frame()

kable(df_unistats)
Minimum Maximum 1. Quartile 3. Quartile Mean Median Variance Stdev Skewness Kurtosis
MSSubClass 20 190 20.0 70.0 5.690e+01 50.0 1.789e+03 4.230e+01 1.4048 1.5644
LotFrontage 0 313 42.0 79.0 5.762e+01 63.0 1.202e+03 3.466e+01 0.2673 3.5852
LotArea 1300 215245 7553.5 11601.5 1.052e+04 9478.5 9.963e+07 9.981e+03 12.1826 202.2623
OverallQual 1 10 5.0 7.0 6.099e+00 6.0 1.913e+00 1.383e+00 0.2165 0.0876
OverallCond 1 9 5.0 6.0 5.575e+00 5.0 1.238e+00 1.113e+00 0.6916 1.0929
YearBuilt 1872 2010 1954.0 2000.0 1.971e+03 1973.0 9.122e+02 3.020e+01 -0.6122 -0.4457
YearRemodAdd 1950 2010 1967.0 2004.0 1.985e+03 1994.0 4.262e+02 2.065e+01 -0.5025 -1.2744
MasVnrArea 0 1600 0.0 164.2 1.031e+02 0.0 3.266e+04 1.807e+02 2.6721 10.0847
BsmtFinSF1 0 5644 0.0 712.2 4.436e+02 383.5 2.080e+05 4.561e+02 1.6820 11.0568
BsmtFinSF2 0 1474 0.0 0.0 4.655e+01 0.0 2.602e+04 1.613e+02 4.2465 20.0089
BsmtUnfSF 0 2336 223.0 808.0 5.672e+02 477.5 1.952e+05 4.419e+02 0.9184 0.4645
TotalBsmtSF 0 6110 795.8 1298.2 1.057e+03 991.5 1.925e+05 4.387e+02 1.5211 13.1789
X1stFlrSF 334 4692 882.0 1391.2 1.163e+03 1087.0 1.495e+05 3.866e+02 1.3739 5.7101
X2ndFlrSF 0 2065 0.0 728.0 3.470e+02 0.0 1.906e+05 4.365e+02 0.8114 -0.5590
LowQualFinSF 0 572 0.0 0.0 5.845e+00 0.0 2.364e+03 4.862e+01 8.9928 82.8282
GrLivArea 334 5642 1129.5 1776.8 1.515e+03 1464.0 2.761e+05 5.255e+02 1.3638 4.8635
BsmtFullBath 0 3 0.0 1.0 4.253e-01 0.0 2.693e-01 5.189e-01 0.5948 -0.8433
BsmtHalfBath 0 2 0.0 0.0 5.750e-02 0.0 5.700e-02 2.388e-01 4.0950 16.3100
FullBath 0 3 1.0 2.0 1.565e+00 2.0 3.035e-01 5.509e-01 0.0365 -0.8612
HalfBath 0 2 0.0 1.0 3.829e-01 0.0 2.529e-01 5.029e-01 0.6745 -1.0800
BedroomAbvGr 0 8 2.0 3.0 2.866e+00 3.0 6.655e-01 8.158e-01 0.2114 2.2120
KitchenAbvGr 0 3 1.0 1.0 1.047e+00 1.0 4.850e-02 2.203e-01 4.4792 21.4211
TotRmsAbvGrd 2 14 5.0 7.0 6.518e+00 6.0 2.642e+00 1.625e+00 0.6750 0.8683
Fireplaces 0 3 0.0 1.0 6.130e-01 1.0 4.156e-01 6.447e-01 0.6482 -0.2244
GarageYrBlt 0 2010 1958.0 2001.0 1.869e+03 1977.0 2.058e+05 4.537e+02 -3.8616 12.9726
GarageCars 0 4 1.0 2.0 1.767e+00 2.0 5.585e-01 7.473e-01 -0.3418 0.2117
GarageArea 0 1418 334.5 576.0 4.730e+02 480.0 4.571e+04 2.138e+02 0.1796 0.9045
WoodDeckSF 0 857 0.0 168.0 9.424e+01 0.0 1.571e+04 1.253e+02 1.5382 2.9704
PoolArea 0 738 0.0 0.0 2.759e+00 0.0 1.614e+03 4.018e+01 14.7979 222.1917
MiscVal 0 15500 0.0 0.0 4.349e+01 0.0 2.461e+05 4.961e+02 24.4265 697.6401
MoSold 1 12 5.0 8.0 6.322e+00 6.0 7.310e+00 2.704e+00 0.2116 -0.4104
YrSold 2006 2010 2007.0 2009.0 2.008e+03 2008.0 1.764e+00 1.328e+00 0.0961 -1.1931
SalePrice 34900 755000 129975.0 214000.0 1.809e+05 163000.0 6.311e+09 7.944e+04 1.8790 6.4968
# Box plots 
meltData <- melt(dfnum1)

p <- ggplot(meltData, aes(factor(variable), value)) 
p + geom_boxplot() + facet_wrap(~variable, scale="free")+theme_economist()+scale_colour_economist()

#density plots
density <- dfnum1 %>%
  gather() %>%                         
  ggplot(aes(value)) +                 
    facet_wrap(~ key, scales = "free") +
    geom_density()+theme_economist()+scale_colour_economist()
density

Correlations among all numerical independent variables

cormatrix<-cor(dfnum1)
cordf<-as.data.frame(cormatrix)
kable(head(cordf))
MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars GarageArea WoodDeckSF PoolArea MiscVal MoSold YrSold SalePrice
MSSubClass 1.0000 -0.2150 -0.1398 0.0326 -0.0593 0.0279 0.0406 0.0236 -0.0698 -0.0656 -0.1408 -0.2385 -0.2518 0.3079 0.0465 0.0749 0.0035 -0.0023 0.1316 0.1774 -0.0234 0.2817 0.0404 -0.0456 -0.0810 -0.0401 -0.0987 -0.0126 0.0083 -0.0077 -0.0136 -0.0214 -0.0843
LotFrontage -0.2150 1.0000 0.1007 0.1766 -0.0535 0.0369 0.0787 0.1050 0.0767 -0.0093 0.1608 0.2383 0.2452 0.0425 0.0500 0.2203 0.0105 -0.0279 0.1205 -0.0130 0.1445 0.0344 0.2214 0.0440 0.0193 0.1652 0.2015 -0.0168 0.1141 -0.0596 0.0189 -0.0121 0.2096
LotArea -0.1398 0.1007 1.0000 0.1058 -0.0056 0.0142 0.0138 0.1033 0.2141 0.1112 -0.0026 0.2608 0.2995 0.0510 0.0048 0.2631 0.1582 0.0480 0.1260 0.0143 0.1197 -0.0178 0.1900 0.2714 0.0726 0.1549 0.1804 0.1717 0.0777 0.0381 0.0012 -0.0143 0.2638
OverallQual 0.0326 0.1766 0.1058 1.0000 -0.0919 0.5723 0.5507 0.4073 0.2397 -0.0591 0.3082 0.5378 0.4762 0.2955 -0.0304 0.5930 0.1111 -0.0402 0.5506 0.2735 0.1017 -0.1839 0.4275 0.3968 0.2890 0.6007 0.5620 0.2389 0.0652 -0.0314 0.0708 -0.0273 0.7910
OverallCond -0.0593 -0.0535 -0.0056 -0.0919 1.0000 -0.3760 0.0737 -0.1257 -0.0462 0.0402 -0.1368 -0.1711 -0.1442 0.0289 0.0255 -0.0797 -0.0549 0.1178 -0.1941 -0.0608 0.0130 -0.0870 -0.0576 -0.0238 -0.0065 -0.1858 -0.1515 -0.0033 -0.0020 0.0688 -0.0035 0.0439 -0.0779
YearBuilt 0.0279 0.0369 0.0142 0.5723 -0.3760 1.0000 0.5929 0.3116 0.2495 -0.0491 0.1490 0.3915 0.2820 0.0103 -0.1838 0.1990 0.1876 -0.0382 0.4683 0.2427 -0.0707 -0.1748 0.0956 0.1477 0.2720 0.5379 0.4790 0.2249 0.0049 -0.0344 0.0124 -0.0136 0.5229

Independent variables with pairwise correlations greater than 0.5 with each other (multicollinearity check)

cordf[cordf == 1] <- NA #drop correlation of 1, diagonals
cordf[abs(cordf) < 0.5] <- NA # drop correlations of less than 0.5
cordf<-as.matrix(cordf)
cordf2<- na.omit(melt(cordf)) 
kable(head(cordf2[order(-abs(cordf2$value)),])) # sort by highest correlations
Var1 Var2 value
852 GarageArea GarageCars 0.8825
884 GarageCars GarageArea 0.8825
518 TotRmsAbvGrd GrLivArea 0.8255
742 GrLivArea TotRmsAbvGrd 0.8255
376 X1stFlrSF TotalBsmtSF 0.8195
408 TotalBsmtSF X1stFlrSF 0.8195

Correlation Matrix for Some Significant Relationships

Test the hypotheses that the correlations between each pairwise set of variables (see below) are 0, and provide an 80% confidence interval.

headers = c("GarageCars","GrLivArea","GarageArea","TotRmsAbvGrd","TotalBsmtSF","X1stFlrSF")

newdata <- dfnum[headers]
correlation = cor(newdata )
corrplot(correlation, method="pie")

#hypotheses testing
#X1stFlrSF with TotalBsmtSF
cor.test(dfnum1$X1stFlrSF, dfnum1$TotalBsmtSF, method = c("pearson", "kendall", "spearman"), conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 55, df = 1458, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.8082 0.8303
## sample estimates:
##    cor 
## 0.8195
#TotRmsAbvGrd with  GrLivArea
cor.test(dfnum1$TotRmsAbvGrd , dfnum1$GrLivArea, method = c("pearson", "kendall", "spearman"), conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 56, df = 1458, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.8145 0.8359
## sample estimates:
##    cor 
## 0.8255
#GarageArea with GarageCars
cor.test(dfnum1$GarageArea , dfnum1$GarageCars, method = c("pearson", "kendall", "spearman"), conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 72, df = 1458, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.8748 0.8897
## sample estimates:
##    cor 
## 0.8825

Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

ans:

-In all three cases the p-values are extremely small, well below the 0.05 significance level. We reject the null hypothesis and conclude that the true correlation is not 0 for each of the selected pairs of variables; they are significantly correlated.

With such small p-values, and with the correlation matrix already showing how strongly related these variables are, I would not be very worried about familywise error in this case; nonetheless, it is worth understanding what it is.

According to https://www.statisticshowto.com/familywise-error-rate/:

The familywise error rate (FWE or FWER) is the probability of coming to at least one false conclusion in a series of hypothesis tests. In other words, it is the probability of making at least one Type I error. The term "familywise" comes from a family of tests, the technical term for a series of tests on the same data.

Formula: The formula to estimate the familywise error rate is: FWER ≤ 1 – (1 – α)^c

Where:

α = alpha level for an individual test (e.g. .05), c = Number of comparisons.

From the equation above, FWER is a function of the number of comparisons c and of alpha. If you want to hold the overall Type I error rate near 0.05, then with about 5 comparison tests each individual test's alpha must be reduced to roughly 0.01; see the graph below.


#plotting FWE vs c
alpha = 0.01
eq = function(c){1-(1-alpha)^c}
ggplot(data.frame(x=c(1, 10)), aes(x=x)) + 
  stat_function(fun=eq)+theme_economist()+scale_colour_economist()+  xlab("c, Number of comparison tests") +
  ylab("Familywise error rate") +
  ggtitle("Family-wise error (FWER) as a function of comparison tests")+ geom_vline(xintercept = 5, linetype="dotted", 
                color = "red", size=1.5)+ geom_hline(yintercept=0.05, linetype="dotted", color = "red", size=1.5)
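If one did want to guard against familywise error here, the three correlation-test p-values could be adjusted directly; a minimal sketch using p.adjust, assuming the same three variable pairs tested above:

#collect the three unadjusted p-values and apply FWER corrections
pvals <- c(cor.test(dfnum1$X1stFlrSF, dfnum1$TotalBsmtSF)$p.value,
           cor.test(dfnum1$TotRmsAbvGrd, dfnum1$GrLivArea)$p.value,
           cor.test(dfnum1$GarageArea, dfnum1$GarageCars)$p.value)
p.adjust(pvals, method = "bonferroni") #each p-value multiplied by the number of tests (capped at 1)
p.adjust(pvals, method = "holm")       #step-down method, slightly less conservative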


Independent variables whose correlation with SalePrice (the dependent variable) is greater than 0.5


cordf2<-as.data.frame(cordf2)
topcors <- cordf2[ which(cordf2$Var2=='SalePrice'),]

topcorsdf<-topcors[order(-abs(topcors$value)),]# sort by highest correlations

cors1<-data.frame(topcorsdf$Var1,topcorsdf$Var2,topcorsdf$value)
cors2<-rename(cors1,Correlation_to_Sale=topcorsdf.value,Independent_variables=topcorsdf.Var1)
kable(cors2)
Independent_variables topcorsdf.Var2 Correlation_to_Sale
OverallQual SalePrice 0.7910
GrLivArea SalePrice 0.7086
GarageCars SalePrice 0.6404
GarageArea SalePrice 0.6234
TotalBsmtSF SalePrice 0.6136
X1stFlrSF SalePrice 0.6059
FullBath SalePrice 0.5607
TotRmsAbvGrd SalePrice 0.5337
YearBuilt SalePrice 0.5229
YearRemodAdd SalePrice 0.5071

Correlation Plot of Sale Price w.r.t. Independent Variables greater than 0.5

p<-ggplot(data=cors2, aes(x=Independent_variables, y=Correlation_to_Sale, fill=Independent_variables)) +
  geom_bar(stat="identity")+ggtitle("Fig1: Top ten Correlation Plot of Sale Price vs. Independent Variables")

# Horizontal bar plot
p + coord_flip()+theme_economist()+scale_colour_economist()

Provide a scatterplot matrix for at least two of the independent variables and the dependent variable

ggplot(dfnum1, aes(x=OverallQual, y=SalePrice)) + geom_point(shape=1) +
    geom_smooth(method=lm,   # Add linear regression lines
                se=T,    # Add shaded confidence region
                fullrange=TRUE)+theme_economist()+scale_colour_economist() +ggtitle("Fig2a: Scatter plot of OverallQual vs SalePrice")

ggplot(dfnum1, aes(x=GrLivArea, y=SalePrice)) + geom_point(shape=1) +
    geom_smooth(method=lm,   # Add linear regression lines
                se=T,    # Add shaded confidence region
                fullrange=TRUE)+theme_economist()+scale_colour_economist() +ggtitle("Fig2b: Scatter plot of GrLivArea vs SalePrice")


Linear Algebra and Correlation Matrix Decomposition exercise.


#inverse of correlation matrix
correlation_inverted = solve(correlation)
correlation_inverted
##              GarageCars GrLivArea GarageArea TotRmsAbvGrd TotalBsmtSF X1stFlrSF
## GarageCars       4.6265   -0.1360    -3.9723     -0.26552     -0.0630   0.15041
## GrLivArea       -0.1360    4.1111    -0.3814     -2.85138     -0.1918  -0.75554
## GarageArea      -3.9723   -0.3814     4.8783      0.31841     -0.3269  -0.29078
## TotRmsAbvGrd    -0.2655   -2.8514     0.3184      3.26498      0.3969  -0.08771
## TotalBsmtSF     -0.0630   -0.1918    -0.3269      0.39692      3.1998  -2.48847
## X1stFlrSF        0.1504   -0.7555    -0.2908     -0.08771     -2.4885   3.57928
#multiply the precision matrix by the correlation matrix
correlation_inverted %*% correlation
##              GarageCars  GrLivArea GarageArea TotRmsAbvGrd TotalBsmtSF
## GarageCars    1.000e+00 -6.939e-17  1.665e-16   -4.163e-17  -2.776e-17
## GrLivArea    -3.331e-16  1.000e+00 -2.220e-16    3.886e-16  -1.110e-16
## GarageArea    2.498e-16  5.551e-17  1.000e+00    1.110e-16   2.776e-17
## TotRmsAbvGrd -2.012e-16 -2.637e-16  8.327e-17    1.000e+00  -2.776e-17
## TotalBsmtSF   4.441e-16  2.220e-16  4.441e-16    0.000e+00   1.000e+00
## X1stFlrSF     0.000e+00  0.000e+00  0.000e+00    0.000e+00  -4.441e-16
##               X1stFlrSF
## GarageCars    1.110e-16
## GrLivArea     2.220e-16
## GarageArea   -5.551e-17
## TotRmsAbvGrd -3.192e-16
## TotalBsmtSF   8.882e-16
## X1stFlrSF     1.000e+00
#multiply the correlation matrix by the precision matrix
correlation %*% correlation_inverted
##              GarageCars GrLivArea GarageArea TotRmsAbvGrd TotalBsmtSF X1stFlrSF
## GarageCars    1.000e+00 0.000e+00  3.053e-16   -1.388e-17   0.000e+00         0
## GrLivArea    -3.192e-16 1.000e+00  1.110e-16    1.457e-16   0.000e+00         0
## GarageArea   -2.914e-16 5.551e-17  1.000e+00   -1.735e-16   0.000e+00         0
## TotRmsAbvGrd -2.429e-16 5.551e-17  1.943e-16    1.000e+00   0.000e+00         0
## TotalBsmtSF  -3.053e-16 0.000e+00  3.608e-16    1.249e-16   1.000e+00         0
## X1stFlrSF    -3.886e-16 2.220e-16  2.776e-16    5.551e-17   4.441e-16         1

Observation 2: multiplying the correlation matrix by its inverse (the precision matrix), in either order, returns the identity matrix up to floating-point rounding error; the off-diagonal entries are on the order of 1e-16.
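A quick numerical check (a sketch) confirms this:

#largest absolute deviation from the identity matrix, in either order of multiplication
max(abs(correlation_inverted %*% correlation - diag(nrow(correlation))))
max(abs(correlation %*% correlation_inverted - diag(nrow(correlation))))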



A <- correlation # let the correlation matrix be assigned to matrix A
luDec <- lu(A)
L <- expand(luDec)$L
L #lower triangle matrix
## 6 x 6 Matrix of class "dtrMatrix" (unitriangular)
##      [,1]     [,2]     [,3]     [,4]     [,5]     [,6]    
## [1,]  1.00000        .        .        .        .        .
## [2,]  0.46725  1.00000        .        .        .        .
## [3,]  0.88248  0.07249  1.00000        .        .        .
## [4,]  0.36229  0.83949 -0.13566  1.00000        .        .
## [5,]  0.43458  0.32214  0.39102 -0.22858  1.00000        .
## [6,]  0.43932  0.46151  0.34977 -0.13442  0.69524  1.00000
U <- expand(luDec)$U
U #upper triangle matrix
## 6 x 6 Matrix of class "dtrMatrix"
##      [,1]     [,2]     [,3]     [,4]     [,5]     [,6]    
## [1,]  1.00000  0.46725  0.88248  0.36229  0.43458  0.43932
## [2,]        .  0.78168  0.05666  0.65621  0.25181  0.36075
## [3,]        .        .  0.21713 -0.02946  0.08490  0.07594
## [4,]        .        .        .  0.31387 -0.07175 -0.04219
## [5,]        .        .        .        .  0.68042  0.47306
## [6,]        .        .        .        .        .  0.27939
P <- expand(luDec)$P
P #permutation matrix (sparse)
## 6 x 6 sparse Matrix of class "pMatrix"
##                 
## [1,] | . . . . .
## [2,] . | . . . .
## [3,] . . | . . .
## [4,] . . . | . .
## [5,] . . . . | .
## [6,] . . . . . |
L %*% U
## 6 x 6 Matrix of class "dgeMatrix"
##        [,1]   [,2]   [,3]   [,4]   [,5]   [,6]
## [1,] 1.0000 0.4672 0.8825 0.3623 0.4346 0.4393
## [2,] 0.4672 1.0000 0.4690 0.8255 0.4549 0.5660
## [3,] 0.8825 0.4690 1.0000 0.3378 0.4867 0.4898
## [4,] 0.3623 0.8255 0.3378 1.0000 0.2856 0.4095
## [5,] 0.4346 0.4549 0.4867 0.2856 1.0000 0.8195
## [6,] 0.4393 0.5660 0.4898 0.4095 0.8195 1.0000
P %*% L %*% U # P L U reconstructs the original matrix from the LU decomposition
## 6 x 6 Matrix of class "dgeMatrix"
##        [,1]   [,2]   [,3]   [,4]   [,5]   [,6]
## [1,] 1.0000 0.4672 0.8825 0.3623 0.4346 0.4393
## [2,] 0.4672 1.0000 0.4690 0.8255 0.4549 0.5660
## [3,] 0.8825 0.4690 1.0000 0.3378 0.4867 0.4898
## [4,] 0.3623 0.8255 0.3378 1.0000 0.2856 0.4095
## [5,] 0.4346 0.4549 0.4867 0.2856 1.0000 0.8195
## [6,] 0.4393 0.5660 0.4898 0.4095 0.8195 1.0000
correlation #the original correlation matrix, which was assigned to A
##              GarageCars GrLivArea GarageArea TotRmsAbvGrd TotalBsmtSF X1stFlrSF
## GarageCars       1.0000    0.4672     0.8825       0.3623      0.4346    0.4393
## GrLivArea        0.4672    1.0000     0.4690       0.8255      0.4549    0.5660
## GarageArea       0.8825    0.4690     1.0000       0.3378      0.4867    0.4898
## TotRmsAbvGrd     0.3623    0.8255     0.3378       1.0000      0.2856    0.4095
## TotalBsmtSF      0.4346    0.4549     0.4867       0.2856      1.0000    0.8195
## X1stFlrSF        0.4393    0.5660     0.4898       0.4095      0.8195    1.0000

Calculus-Based Probability & Statistics.


#plot a histogram of the SalePrice variable
hist(dfnum1$SalePrice , col = 'yellow', main = 'Fig3a: SalePrice - the dependent variable to investigate', breaks = 30)

summary(dfnum1$SalePrice ) # original distribution of saleprice
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000


library(MASS)# load the MASS library per instruction

edist<-fitdistr(dfnum1$SalePrice, densfun = 'exponential')
lambda <- edist$estimate
exp_distibution <- rexp(1000, lambda)

summary(exp_distibution)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      15   54796  125590  184969  252524 2025507
#Optimal lambda value
print("The fitted optimal lambda based on N = 1000 samples")
## [1] "The fitted optimal lambda based on N = 1000 samples"
lambda 
##      rate 
## 5.527e-06


par(mfrow=c(1,2))
#original histogram without exponential fitting
hist(dfnum1$SalePrice , col = 'yellow', main = 'Fig3a: SalePrice - original distribution', breaks = 30)
#histogram with exponential fitting
hist(exp_distibution, freq = FALSE, breaks = 100, main ="Fig3b: Fitted Exponential PDF SalePrice",col = "blue", xlim = c(1, quantile(exp_distibution, 0.99)))
curve(dexp(x, rate = lambda), col = "red", add = TRUE)

#Generating the 5th and 95th percentiles
print("The 95% confidence interval of exponential distribution with the optimal lambda")
## [1] "The 95% confidence interval of exponential distribution with the optimal lambda"
qexp(c(0.05,0.95), lambda)
## [1]   9280 541991
#Generating 95% confidence level from the data, assuming that the distribution is normal
print("The 95% confidence interval of original sales price distribution assuming normality")
## [1] "The 95% confidence interval of original sales price distribution assuming normality"
qnorm(c(0.05, 0.95), mean = mean(dfnum1$SalePrice), sd = sd(dfnum1$SalePrice))
## [1]  50250 311592
#the empirical 5th percentile and 95th percentile of the data
print("The empirically generated 95% confidence interval of actual sales price data")
## [1] "The empirically generated 95% confidence interval of actual sales price data"
quantile(dfnum1$SalePrice, c(0.05, 0.95))
##     5%    95% 
##  88000 326100

Let’s break this down one item at a time:

Given that the fitted rate and the reciprocal of the sample mean (checked below) are approximately the same, we can say 5.527e-06 is the optimal (maximum-likelihood) lambda.
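A minimal check of this, using the fact that the exponential MLE rate is the reciprocal of the sample mean:

#the exponential maximum-likelihood rate equals 1 / sample mean
1/mean(dfnum1$SalePrice) #compare with the fitted lambda above (about 5.527e-06)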

Based on these samples, the best single point estimate of a typical sale price is the sample mean (here 180,921). That is a good point estimate, but it is also useful to communicate how spread out the values are around it.

The intervals above capture that spread in three ways: the 5th and 95th percentiles of the fitted exponential distribution, the 5th and 95th percentiles of a normal distribution with the sample mean and standard deviation, and the empirical 5th and 95th percentiles of the data. Each pair of numbers bounds the central 90% of sale prices under its respective assumption.

Even though we do not observe the full population, these intervals tell us where the vast majority of sale prices fall. The exponential-based interval is by far the widest, which already suggests the exponential is not a great fit for SalePrice. A confidence interval for the mean itself would be much narrower, because it is based on the standard error rather than the standard deviation (see the sketch below).
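For contrast, a minimal sketch of an approximate 95% confidence interval for the mean sale price, using the standard error:

#approximate 95% confidence interval for the mean sale price (normal approximation)
se <- sd(dfnum1$SalePrice)/sqrt(length(dfnum1$SalePrice))
mean(dfnum1$SalePrice) + c(-1.96, 1.96)*se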


Modeling in Kaggle


#these predictors were found to have high correlation to sales price, see Fig1
headers1 = c("GarageArea","GrLivArea","FullBath","OverallQual","TotalBsmtSF","YearBuilt","YearRemodAdd")
# This is to make sure we did not accidentally pick predictors with high VIF (strong multicollinearity)
newdata1 <- dfnum[headers1]
correlation1 = cor(newdata1 )
print("Correlational relationships of Predictors with each other")
## [1] "Correlational relationships of Predictors with each other"
correlation1
##              GarageArea GrLivArea FullBath OverallQual TotalBsmtSF YearBuilt
## GarageArea       1.0000    0.4690   0.4057      0.5620      0.4867    0.4790
## GrLivArea        0.4690    1.0000   0.6300      0.5930      0.4549    0.1990
## FullBath         0.4057    0.6300   1.0000      0.5506      0.3237    0.4683
## OverallQual      0.5620    0.5930   0.5506      1.0000      0.5378    0.5723
## TotalBsmtSF      0.4867    0.4549   0.3237      0.5378      1.0000    0.3915
## YearBuilt        0.4790    0.1990   0.4683      0.5723      0.3915    1.0000
## YearRemodAdd     0.3716    0.2874   0.4390      0.5507      0.2911    0.5929
##              YearRemodAdd
## GarageArea         0.3716
## GrLivArea          0.2874
## FullBath           0.4390
## OverallQual        0.5507
## TotalBsmtSF        0.2911
## YearBuilt          0.5929
## YearRemodAdd       1.0000
#fit the regression model
model <- lm(SalePrice ~ GarageArea+ GrLivArea + FullBath + OverallQual+TotalBsmtSF+YearBuilt+YearRemodAdd ,data = dfnum1)
#view the output of the regression model
summary(model)
## 
## Call:
## lm(formula = SalePrice ~ GarageArea + GrLivArea + FullBath + 
##     OverallQual + TotalBsmtSF + YearBuilt + YearRemodAdd, data = dfnum1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -519713  -19055   -2564   14943  289099 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.21e+06   1.28e+05   -9.50  < 2e-16 ***
## GarageArea    4.56e+01   6.14e+00    7.41  2.1e-13 ***
## GrLivArea     5.40e+01   3.01e+00   17.96  < 2e-16 ***
## FullBath     -5.45e+03   2.66e+03   -2.05     0.04 *  
## OverallQual   1.98e+04   1.18e+03   16.77  < 2e-16 ***
## TotalBsmtSF   2.79e+01   2.88e+00    9.68  < 2e-16 ***
## YearBuilt     2.84e+02   4.98e+01    5.70  1.5e-08 ***
## YearRemodAdd  2.98e+02   6.40e+01    4.65  3.6e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38100 on 1452 degrees of freedom
## Multiple R-squared:  0.771,  Adjusted R-squared:  0.769 
## F-statistic:  696 on 7 and 1452 DF,  p-value: <2e-16
#calculate the VIF for each predictor variable in the model
 VIF <- function(linear.model, no.intercept=FALSE, all.diagnostics=FALSE, plot=FALSE) {
    require(mctest)
    if(no.intercept==FALSE) design.matrix <- model.matrix(linear.model)[,-1]
    if(no.intercept==TRUE) design.matrix <- model.matrix(linear.model)
    if(plot==TRUE) mc.plot(design.matrix,linear.model$model[1])
    if(all.diagnostics==FALSE) output <- imcdiag(design.matrix,linear.model$model[1], method='VIF')$idiags[,1]
    if(all.diagnostics==TRUE) output <- imcdiag(design.matrix,linear.model$model[1])
    output
 }


VIF(model)
## Loading required package: mctest
##   GarageArea    GrLivArea     FullBath  OverallQual  TotalBsmtSF    YearBuilt 
##        1.730        2.503        2.148        2.662        1.599        2.268 
## YearRemodAdd 
##        1.750
#create vector of VIF values
vif_values <- VIF(model)
#create horizontal bar chart to display each VIF value
barplot(vif_values, main = "Fig4: VIF Values on Model 1", horiz = TRUE, col = "gold",las=2,cex.names=.53,xlim=c(0,5))
#add vertical line at 5
abline(v = 5, lwd = 3, lty = 2)

- The above VIF analysis and the correlation table show that the 7 chosen predictors do not exhibit significant multicollinearity (all VIFs < 5) and should therefore make good predictors for the dependent variable
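For reference, since the car package is already loaded, car::vif should return essentially the same values as the custom VIF() helper above (a sketch):

#standard variance inflation factors from the car package
car::vif(model)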

Statistical Diagnostics

The purpose of this set of tests is to check that the model is correctly specified


# Compute the analysis of variance
res.aov <- aov(model, data = dfnum1)
# Summary of the analysis
summary(res.aov)
##                Df   Sum Sq  Mean Sq F value  Pr(>F)    
## GarageArea      1 3.58e+12 3.58e+12  2459.2 < 2e-16 ***
## GrLivArea       1 2.05e+12 2.05e+12  1405.3 < 2e-16 ***
## FullBath        1 8.38e+10 8.38e+10    57.6 5.8e-14 ***
## OverallQual     1 1.10e+12 1.10e+12   757.1 < 2e-16 ***
## TotalBsmtSF     1 1.67e+11 1.67e+11   114.7 < 2e-16 ***
## YearBuilt       1 8.70e+10 8.70e+10    59.8 2.0e-14 ***
## YearRemodAdd    1 3.15e+10 3.15e+10    21.6 3.6e-06 ***
## Residuals    1452 2.11e+12 1.46e+09                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 1. Homogeneity of variances check
plot(res.aov, 1)

# 2. Normality of residual check
plot(res.aov, 2)

#histogram of residuals
ggplot(model, aes(x=.resid)) + geom_histogram(bins=30)


Summary of statistical diagnostics & overall evaluation on Model 1:

  1. The \(R^{2}\) of 0.771 (adjusted 0.769) and the highly significant F-statistic indicate a model with good explanatory power
  2. All the coefficients of the model are statistically significant, with |t|-statistics > 2 and correspondingly low p-values (<0.05)
  3. The residuals of the model look poor, though: (A) they exhibit heteroscedasticity, (B) normality of the residuals is not met, and (C) the histogram of residuals reveals a few outliers (which may or may not need corrective action)

Overall, Model 1 is not entirely correctly specified; it is good but not great.

The prescribed solution is to perform some transformations on some or all of the predictor variables to improve the model.

  1. From our initial box and density plots, we can see which prescription to apply to the predictor variables:

Let's start with the obvious first: taking logs. We then review the transformed predictors and decide whether to keep them or not.


Transform the predictors by taking logs

#headers1 = c("GarageCars","GrLivArea","FullBath","OverallQual","TotalBsmtSF","YearBuilt","YearRemodAdd")
dfnum2<-fortify(dfnum1)
GrLivArealog= log(dfnum2$GrLivArea+min(dfnum2$GrLivArea))
TotalBsmtSFlog= log(dfnum2$TotalBsmtSF)
YearBuiltlog=log(dfnum2$YearBuilt)
YearRemodAddlog=log(dfnum2$YearRemodAdd)
Garagearealog=log(dfnum2$GarageArea)
#FullBathlog=log(dfnum2$FullBath)

Plotting the transformed data

par(mfrow=c(3,3))
hist(GrLivArealog , col = 'yellow', main = 'Fig5a: Log of GrLivArea', breaks = 30)
hist(TotalBsmtSFlog , col = 'green', main = 'Fig5b: Log of TotalBsmtSF', breaks = 30)
hist(YearBuiltlog, col = 'red', main = 'Fig5c: Log of YearBuilt', breaks = 30)
hist(YearRemodAddlog , col = 'orange', main = 'Fig5e: Log of YearRemodAdd', breaks = 30)
hist(Garagearealog , col = 'darkorchid2', main = 'Fig5f: Log of Garagearea', breaks = 30)

Taking the logs of YearRemodAdd, YearBuilt and FullBath did not look promising, so I decided to drop them as predictors in my next model.
#fit the regression model again but with transformed data
 
#dfnum3<-dfnum2 %>% mutate(TotalBsmtSFlog=log(dfnum2$TotalBsmtSF),Garagearealog=log(dfnum2$GarageArea),GrLivArealog=log(dfnum2$GrLivArea))
#model1 <- lm((SalePrice)~ TotalBsmtSFlog+Garagearealog+GrLivArealog+OverallQual,data = dfnum3,na.action=na.exclude)
#view the output of the regression model with transformed data
#summary(model1)
I was getting this error after taking the log of the predictors: Error in lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) : NA/NaN/Inf in 'x'. The cause is that many homes have a TotalBsmtSF or GarageArea of 0, and log(0) = -Inf.
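A quick check (a sketch) confirms the source of the -Inf values:

#count the zero values that produce log(0) = -Inf in the transformed predictors
sum(dfnum2$TotalBsmtSF == 0) #homes with no basement
sum(dfnum2$GarageArea == 0)  #homes with no garage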

Corrective Actions:


dfnum3<-dfnum2 %>% mutate(TotalBsmtSFlog=log(dfnum2$TotalBsmtSF+1),Garagearealog=log(dfnum2$GarageArea+1),GrLivArealog=log(dfnum2$GrLivArea+1))
#view the output of the regression model with transformed data
model1 <- lm((SalePrice)~ TotalBsmtSFlog+Garagearealog+GrLivArealog+OverallQual,data = dfnum3,na.action=na.exclude)
summary(model1)
## 
## Call:
## lm(formula = (SalePrice) ~ TotalBsmtSFlog + Garagearealog + GrLivArealog + 
##     OverallQual, data = dfnum3, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -263785  -24419   -2463   19308  357747 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -628841      28513  -22.05  < 2e-16 ***
## TotalBsmtSFlog     4123       1058    3.90  0.00010 ***
## Garagearealog      3108        848    3.66  0.00026 ***
## GrLivArealog      78772       4339   18.15  < 2e-16 ***
## OverallQual       31368       1116   28.11  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43400 on 1455 degrees of freedom
## Multiple R-squared:  0.702,  Adjusted R-squared:  0.701 
## F-statistic:  858 on 4 and 1455 DF,  p-value: <2e-16
any(is.na(dfnum3$TotalBsmtSFlog)) #check to see if we have any NA's
## [1] FALSE
res.aov1 <- aov(model1, data = dfnum3)
# Summary of the analysis
summary(res.aov1)
##                  Df   Sum Sq  Mean Sq F value Pr(>F)    
## TotalBsmtSFlog    1 9.78e+11 9.78e+11     519 <2e-16 ***
## Garagearealog     1 9.37e+11 9.37e+11     497 <2e-16 ***
## GrLivArealog      1 3.06e+12 3.06e+12    1625 <2e-16 ***
## OverallQual       1 1.49e+12 1.49e+12     790 <2e-16 ***
## Residuals      1455 2.74e+12 1.88e+09                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(res.aov1, 2)

plot(res.aov1, 1)


Stepwise Regression


#setting a "full model" with all the numeric predictors in it
full.model <- lm(SalePrice ~ MSSubClass + LotFrontage + LotArea + OverallQual 
                       + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 +
                      BsmtFinSF2  + X1stFlrSF + X2ndFlrSF  +
                      BsmtFullBath + BedroomAbvGr  +
                      TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF +
                      PoolArea, data = dfnum3)
# Stepwise regression model- in both directions
step.model <- stepAIC(full.model, direction = "both", 
                      trace = FALSE,k=2)
summary(step.model)
## 
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual + 
##     YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + X1stFlrSF + 
##     X2ndFlrSF + BsmtFullBath + BedroomAbvGr + TotRmsAbvGrd + 
##     Fireplaces + GarageCars + WoodDeckSF, data = dfnum3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -484184  -16921   -1841   14129  284221 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.04e+06   1.15e+05   -9.00  < 2e-16 ***
## MSSubClass   -1.99e+02   2.41e+01   -8.25  3.4e-16 ***
## LotArea       4.25e-01   1.01e-01    4.20  2.8e-05 ***
## OverallQual   1.97e+04   1.11e+03   17.76  < 2e-16 ***
## YearBuilt     2.02e+02   4.51e+01    4.48  8.2e-06 ***
## YearRemodAdd  3.00e+02   6.02e+01    4.99  6.9e-07 ***
## MasVnrArea    3.22e+01   5.93e+00    5.44  6.2e-08 ***
## BsmtFinSF1    1.28e+01   2.99e+00    4.30  1.8e-05 ***
## X1stFlrSF     5.16e+01   4.59e+00   11.24  < 2e-16 ***
## X2ndFlrSF     4.71e+01   4.09e+00   11.50  < 2e-16 ***
## BsmtFullBath  7.80e+03   2.40e+03    3.25   0.0012 ** 
## BedroomAbvGr -8.91e+03   1.67e+03   -5.33  1.1e-07 ***
## TotRmsAbvGrd  4.04e+03   1.18e+03    3.44   0.0006 ***
## Fireplaces    4.83e+03   1.72e+03    2.81   0.0050 ** 
## GarageCars    1.03e+04   1.71e+03    6.04  2.0e-09 ***
## WoodDeckSF    2.58e+01   7.88e+00    3.28   0.0011 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35200 on 1444 degrees of freedom
## Multiple R-squared:  0.806,  Adjusted R-squared:  0.804 
## F-statistic:  400 on 15 and 1444 DF,  p-value: <2e-16
#Note1: FullBath, BsmtUnfSF & LowQualFinSF were left out because they do not contribute significantly to the model
#Note2: KitchenAbvGr & OverallCond had statistically insignificant F-values
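As a quick sanity check (a sketch, assuming both fitted objects are still in the workspace), AIC can be used to compare Model 1 with the stepwise model; the lower value indicates the better trade-off between fit and complexity:

#compare Model 1 and the stepwise model by AIC (lower is better)
AIC(model, step.model)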

Statistical Diagnostics Post-Stepwise Regression


res.aov3 <- aov(step.model, data = dfnum3)
# Summary of the analysis
summary(res.aov3)
##                Df   Sum Sq  Mean Sq F value  Pr(>F)    
## MSSubClass      1 6.54e+10 6.54e+10   52.86 5.9e-13 ***
## LotArea         1 5.97e+11 5.97e+11  482.16 < 2e-16 ***
## OverallQual     1 5.47e+12 5.47e+12 4418.09 < 2e-16 ***
## YearBuilt       1 8.54e+10 8.54e+10   69.02 < 2e-16 ***
## YearRemodAdd    1 3.74e+10 3.74e+10   30.23 4.5e-08 ***
## MasVnrArea      1 2.17e+11 2.17e+11  175.45 < 2e-16 ***
## BsmtFinSF1      1 1.68e+11 1.68e+11  135.55 < 2e-16 ***
## X1stFlrSF       1 1.79e+11 1.79e+11  144.54 < 2e-16 ***
## X2ndFlrSF       1 4.65e+11 4.65e+11  375.61 < 2e-16 ***
## BsmtFullBath    1 1.91e+10 1.91e+10   15.47 8.8e-05 ***
## BedroomAbvGr    1 3.12e+10 3.12e+10   25.21 5.8e-07 ***
## TotRmsAbvGrd    1 1.87e+10 1.87e+10   15.13 0.00011 ***
## Fireplaces      1 1.17e+10 1.17e+10    9.41 0.00219 ** 
## GarageCars      1 4.60e+10 4.60e+10   37.14 1.4e-09 ***
## WoodDeckSF      1 1.33e+10 1.33e+10   10.73 0.00108 ** 
## Residuals    1444 1.79e+12 1.24e+09                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(res.aov3, 2)

plot(res.aov3, 1)

ggplot(step.model, aes(x=.resid)) + geom_histogram(bins=30)

#Check for VIFs
VIF(step.model)
##   MSSubClass      LotArea  OverallQual    YearBuilt YearRemodAdd   MasVnrArea 
##        1.223        1.201        2.777        2.188        1.820        1.353 
##   BsmtFinSF1    X1stFlrSF    X2ndFlrSF BsmtFullBath BedroomAbvGr TotRmsAbvGrd 
##        2.187        3.713        3.768        1.825        2.194        4.304 
##   Fireplaces   GarageCars   WoodDeckSF 
##        1.447        1.918        1.149
#create vector of VIF values
vif_values1 <- VIF(step.model)

#create horizontal bar chart to display and check each VIF values again after step wise regression
barplot(vif_values1, main = "Fig6: VIF Values after step-wise regression", horiz = TRUE, col = "turquoise1",las=2,cex.names=.53,xlim=c(0,5))
#add vertical line at 5
abline(v = 5, lwd = 3, lty = 2)

#check whether the outliers need our attention or not
outlierTest(step.model)
##      rstudent unadjusted p-value Bonferroni p
## 1299  -16.522          2.687e-56    3.923e-53
## 524   -10.337          3.267e-24    4.770e-21
## 1183    8.509          4.313e-17    6.298e-14
## 692     8.128          9.317e-16    1.360e-12
## 804     6.923          6.613e-12    9.655e-09
## 899     6.268          4.827e-10    7.048e-07
## 1047    6.154          9.790e-10    1.429e-06
## 1170    4.790          1.841e-06    2.688e-03
## 441     4.706          2.766e-06    4.039e-03

1) The slope coefficients are still significant: t-statistics > 2 with correspondingly low p-values (<0.05)
2) The F-values exhibit statistical significance
3) The VIF values of the predictors are all lower than 5
4) The adjusted R-squared of 0.804 is a marked improvement over Model 1, which was at 0.769
5) The residuals of Model 2 look better: (A) some heteroscedasticity remains, but it is less pronounced than in Model 1; (B) normality of the residuals is not perfect but shows a marked improvement; (C) the histogram of residuals still reveals a few outliers, so an outlier test was run to see whether corrective action is needed
6) Several observations have very small Bonferroni-adjusted p-values in the outlier test, indicating that they are genuine outliers (see the last output above); a refit without them is sketched below as a possible corrective action
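If corrective action were desired, one option (a hypothetical sketch, not applied here) would be to refit the stepwise model without the flagged observations and compare the fit:

#refit without the rows flagged by outlierTest (row labels taken from the output above)
flagged <- c(1299, 524, 1183, 692, 804, 899, 1047, 1170, 441)
step.model.noout <- update(step.model, data = dfnum3[-flagged, ])
summary(step.model.noout)$adj.r.squared #compare with 0.804 from the full data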

Let's apply our final Model 2 to the test data and make some predictions.


# Read in from Kaggle the test data set
url2<- "https://raw.githubusercontent.com/ssufian/DAT-605/master/test.csv"
test_df <- read.csv(url2, header = TRUE, sep = ",",stringsAsFactors = FALSE)
#replace NA with zeros across all columns, as was done for Model 1
test_df1<-test_df %>% mutate_all(~replace(., is.na(.), 0))
#Filter Test Data from Kaggle to reflect only the numerical predictors that we had used in the training set
test_df2<- dplyr::select(test_df1,Id,MSSubClass,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,X1stFlrSF,X2ndFlrSF ,BsmtFullBath ,BedroomAbvGr,TotRmsAbvGrd , Fireplaces ,GarageCars,WoodDeckSF,PoolArea)
#Make predictions with selected predictors
test_results<-predict(step.model, test_df2)
head(test_results) #checking the results
##      1      2      3      4      5      6 
## 113289 165950 175587 198751 189161 184604
#Making columns per Kaggle's instructions
prediction <- data.frame(Id = test_df2[,"Id"],  SalePrice = test_results)

head(prediction, 10)
##      Id SalePrice
## 1  1461    113289
## 2  1462    165950
## 3  1463    175587
## 4  1464    198751
## 5  1465    189161
## 6  1466    184604
## 7  1467    194618
## 8  1468    173252
## 9  1469    216060
## 10 1470    116969

Write into a csv to submit to Kaggle

write.csv(prediction, "salepricepredictions.csv", row.names = FALSE) #row.names = FALSE keeps only the Id and SalePrice columns expected by Kaggle

Results from Kaggle

My Kaggle user name:

#

YouTube link: https://youtu.be/nZtMCDD_918