DAT 605 FINAL SPRING 2020

library(dplyr)
library(knitr)
library(moments)
library(kableExtra)
library(reshape2)
library(ggplot2)
library(tidyr)
library(corrplot)
library(MASS)
library(pastecs)
library(psych)
library(e1071)
library(fBasics)
library(mosaic)
require(ggthemes)
library(plogr)
require(Matrix)
library(onehot)
library(data.table)
library(mltools)
library(car)

Problem 1


Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with mean \(\mu = (N+1)/2\) and standard deviation \(\sigma = (N+1)/2\).


#let N be 10
N <- 10

set.seed(1)
X <- runif(10000, 1, N)

set.seed(1)
Y <- rnorm(10000, (N+1)/2, (N+1)/2)

xmean <- mean(X)
ymean<-mean(Y)
xmedian <-median(X)
ymedian <-median(Y)

print(paste0("Mean of X is ", round(xmean,2)))
## [1] "Mean of X is 5.5"
print(paste0("Mean of Y is ", round(ymean,2)))
## [1] "Mean of Y is 5.46"
print(paste0("Median of X is ",round(xmedian,2) ))
## [1] "Median of X is 5.46"
print(paste0("Median of Y is ", round( ymedian,2)))
## [1] "Median of Y is 5.41"
#plots of the 2 distributions
hist(X, col='#80cbc4', xlab="X", main="Uniform Distribution of X")

hist(Y, col='green', xlab="Y", main="Normal Distribution of Y")

Observation 1: X is spread roughly evenly between 1 and N = 10 (a rectangular shape), while Y is bell-shaped and centered near (N+1)/2 = 5.5 with a much wider spread (it even takes negative values, since its standard deviation is also 5.5). The means and medians of both variables are all close to 5.5.


Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

  a. P(X>x | X>y)
    Using the conditional probability formula: \(P(A \mid B) = P(A \cap B)/P(B)\)

#creating dataframe for the uniform distribution of X only
df <- as.data.frame((X))
head(df)
##        (X)
## 1 3.389578
## 2 4.349115
## 3 6.155680
## 4 9.173870
## 5 2.815137
## 6 9.085507
#To determine 1st quartile of Y
summary(Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -14.692   1.796   5.412   5.464   9.227  26.457
y <- quantile(Y, 0.25)
y
##      25% 
## 1.796331
#To determine the median (2nd quartile) of X
summary(X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.001   3.273   5.462   5.502   7.812   9.999
x <- quantile(X, 0.5)
x
##      50% 
## 5.461671
options(digits=4)
#because X is uniformly distributed, every value is equally likely (rectangular distribution)
p1<-nrow(subset(df, X > x & X>y))/nrow(subset(df,X>y))
p1
## [1] 0.5499

  b. P(X>x, Y>y)

options(digits=4)
#For the uniform distribution
Px <- nrow(subset(df, X > x))/nrow(df)
#But since Y is normally distributed, let's find the z score first and use pnorm to obtain the probability P(Y>y)
z <- ((y-(N+1)/2))/((N+1)/2)
Py <-1-pnorm(z) 
# Now apply the formula of P(A)*P(B)
Px*Py
##    25% 
## 0.3748
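As a cross-check (a sketch using the simulated vectors directly), the joint probability can also be estimated empirically and compared with the product of the marginals above:

#empirical joint probability estimated from the simulated (X, Y) pairs
p_joint <- mean(X > x & Y > y)
p_joint #should be close to Px*Py if X>x and Y>y are independent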

  c. P(X<x | X>y)

Apply the same principle of conditional probability as in part (a).


options(digits=4)
p2<-nrow(subset(df, X < x & X>y))/nrow(subset(df,X>y))
p2
## [1] 0.4501

Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.


n<-nrow(df)
c1<-nrow(subset(df, X < x & Y<y))/n
c2<-nrow(subset(df, X < x & Y>y))/n
c3<-c1+c2
c4<-nrow(subset(df, X > x & Y<y))/n
c5<-nrow(subset(df, X > x & Y>y))/n
c6<-c4+c5
c7<-c1+c4
c8<-c2+c5
c9<-c3+c6


dfcounts<-matrix(round(c(c1,c2,c3,c4,c5,c6,c7,c8,c9),3), ncol=3, nrow=3, byrow=T)
colnames(dfcounts)<-c(
"Y<y",
"Y>y",
"Total")
rownames(dfcounts)<-c("X<x","X>x","Total")

print("Contingency Table by Percentage")
## [1] "Contingency Table by Percentage"
dfcounts<-as.table(dfcounts)
dfcounts
##         Y<y   Y>y Total
## X<x   0.126 0.374 0.500
## X>x   0.124 0.376 0.500
## Total 0.250 0.750 1.000
options(digits=4)
#from the percentage contingency table above
P_X_greater_x <- 0.5 #marginal across of the table (P(X>x))
P_Y_greater_y<- 0.75 #marginal vertical of the table (P(Y>y))
P_X_greater_x*P_Y_greater_y
## [1] 0.375

P(X>x and Y>y), from the joint cell of the contingency table => 0.376

P(X>x)P(Y>y), from multiplying the marginal probabilities of the contingency table together (see above) => 0.375

Ans: the two values are essentially equal, which is consistent with X>x and Y>y being independent
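For reference, the same table can be built as counts directly from the logical comparisons (a sketch; addmargins adds the row and column totals):

#equivalent contingency table of counts built directly from the logical vectors
addmargins(table(X > x, Y > y))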


Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?

H0: X>x and Y>y are independent events.
HA: X>x and Y>y are dependent events.


#Converting the percentage contingency table to counts to perform Fisher and Chi Square tests of independence
print("Contingency Table by Counts")
## [1] "Contingency Table by Counts"
dfvals<-round(dfcounts*10000,0)
#subsetting contingency table for only the relevant counts
dfcountssubset <- dfvals[c(1:2),c(1:2)]
fisher.test(dfcountssubset )#Fisher Test
## 
##  Fisher's Exact Test for Count Data
## 
## data:  dfcountssubset
## p-value = 0.7
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.9321 1.1195
## sample estimates:
## odds ratio 
##      1.022
chisq.test(dfcountssubset)#Chi square test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  dfcountssubset
## X-squared = 0.19, df = 1, p-value = 0.7

Conclusion: Both tests return a p-value of about 0.7, far above 0.05, so we fail to reject H0; the data are consistent with X>x and Y>y being independent. The difference between the tests is that Fisher's Exact Test computes an exact p-value and is designed for small samples, while the Chi-Square test relies on a large-sample approximation. With 10,000 observations the Chi-Square test is the more appropriate (and cheaper) choice here, although both lead to the same conclusion.


Problem # 2: Advanced Regression Techniques competition in kaggle.com

url1<- "https://raw.githubusercontent.com/ssufian/DAT-605/master/train.csv"

train_df <- read.csv(url1, header = TRUE, sep = ",",stringsAsFactors = FALSE)

#replace NA with zeros (applied across all columns; only the numeric columns are used below)
dfnum<-train_df %>% mutate_all(~replace(., is.na(.), 0))
#checking dimensions of dataframe
nrow(dfnum)
## [1] 1460
ncol(dfnum)
## [1] 81
#Taking only columns with numerical data
dfnum1<- dfnum[c(2,4,5, 18:21, 27,35, 37:39, 44:53, 55, 57, 60, 62, 63, 67, 72, 76:78, 81)]

df_unistats<- basicStats(dfnum1)[c("Minimum", "Maximum", "1. Quartile", "3. Quartile", "Mean", "Median",
                               "Variance", "Stdev", "Skewness", "Kurtosis"), ] %>% t() %>% as.data.frame()

kable(df_unistats)
Minimum Maximum 1. Quartile 3. Quartile Mean Median Variance Stdev Skewness Kurtosis
MSSubClass 20 190 20.0 70.0 5.690e+01 50.0 1.789e+03 4.230e+01 1.4048 1.5644
LotFrontage 0 313 42.0 79.0 5.762e+01 63.0 1.202e+03 3.466e+01 0.2673 3.5852
LotArea 1300 215245 7553.5 11601.5 1.052e+04 9478.5 9.963e+07 9.981e+03 12.1826 202.2623
OverallQual 1 10 5.0 7.0 6.099e+00 6.0 1.913e+00 1.383e+00 0.2165 0.0876
OverallCond 1 9 5.0 6.0 5.575e+00 5.0 1.238e+00 1.113e+00 0.6916 1.0929
YearBuilt 1872 2010 1954.0 2000.0 1.971e+03 1973.0 9.122e+02 3.020e+01 -0.6122 -0.4457
YearRemodAdd 1950 2010 1967.0 2004.0 1.985e+03 1994.0 4.262e+02 2.065e+01 -0.5025 -1.2744
MasVnrArea 0 1600 0.0 164.2 1.031e+02 0.0 3.266e+04 1.807e+02 2.6721 10.0847
BsmtFinSF1 0 5644 0.0 712.2 4.436e+02 383.5 2.080e+05 4.561e+02 1.6820 11.0568
BsmtFinSF2 0 1474 0.0 0.0 4.655e+01 0.0 2.602e+04 1.613e+02 4.2465 20.0089
BsmtUnfSF 0 2336 223.0 808.0 5.672e+02 477.5 1.952e+05 4.419e+02 0.9184 0.4645
TotalBsmtSF 0 6110 795.8 1298.2 1.057e+03 991.5 1.925e+05 4.387e+02 1.5211 13.1789
X1stFlrSF 334 4692 882.0 1391.2 1.163e+03 1087.0 1.495e+05 3.866e+02 1.3739 5.7101
X2ndFlrSF 0 2065 0.0 728.0 3.470e+02 0.0 1.906e+05 4.365e+02 0.8114 -0.5590
LowQualFinSF 0 572 0.0 0.0 5.845e+00 0.0 2.364e+03 4.862e+01 8.9928 82.8282
GrLivArea 334 5642 1129.5 1776.8 1.515e+03 1464.0 2.761e+05 5.255e+02 1.3638 4.8635
BsmtFullBath 0 3 0.0 1.0 4.253e-01 0.0 2.693e-01 5.189e-01 0.5948 -0.8433
BsmtHalfBath 0 2 0.0 0.0 5.750e-02 0.0 5.700e-02 2.388e-01 4.0950 16.3100
FullBath 0 3 1.0 2.0 1.565e+00 2.0 3.035e-01 5.509e-01 0.0365 -0.8612
HalfBath 0 2 0.0 1.0 3.829e-01 0.0 2.529e-01 5.029e-01 0.6745 -1.0800
BedroomAbvGr 0 8 2.0 3.0 2.866e+00 3.0 6.655e-01 8.158e-01 0.2114 2.2120
KitchenAbvGr 0 3 1.0 1.0 1.047e+00 1.0 4.850e-02 2.203e-01 4.4792 21.4211
TotRmsAbvGrd 2 14 5.0 7.0 6.518e+00 6.0 2.642e+00 1.625e+00 0.6750 0.8683
Fireplaces 0 3 0.0 1.0 6.130e-01 1.0 4.156e-01 6.447e-01 0.6482 -0.2244
GarageYrBlt 0 2010 1958.0 2001.0 1.869e+03 1977.0 2.058e+05 4.537e+02 -3.8616 12.9726
GarageCars 0 4 1.0 2.0 1.767e+00 2.0 5.585e-01 7.473e-01 -0.3418 0.2117
GarageArea 0 1418 334.5 576.0 4.730e+02 480.0 4.571e+04 2.138e+02 0.1796 0.9045
WoodDeckSF 0 857 0.0 168.0 9.424e+01 0.0 1.571e+04 1.253e+02 1.5382 2.9704
PoolArea 0 738 0.0 0.0 2.759e+00 0.0 1.614e+03 4.018e+01 14.7979 222.1917
MiscVal 0 15500 0.0 0.0 4.349e+01 0.0 2.461e+05 4.961e+02 24.4265 697.6401
MoSold 1 12 5.0 8.0 6.322e+00 6.0 7.310e+00 2.704e+00 0.2116 -0.4104
YrSold 2006 2010 2007.0 2009.0 2.008e+03 2008.0 1.764e+00 1.328e+00 0.0961 -1.1931
SalePrice 34900 755000 129975.0 214000.0 1.809e+05 163000.0 6.311e+09 7.944e+04 1.8790 6.4968
# Box plots 
meltData <- melt(dfnum1)

p <- ggplot(meltData, aes(factor(variable), value)) 
p + geom_boxplot() + facet_wrap(~variable, scale="free")+theme_economist()+scale_colour_economist()

#density plots
density <- dfnum1 %>%
  gather() %>%                         
  ggplot(aes(value)) +                 
    facet_wrap(~ key, scales = "free") +
    geom_density()+theme_economist()+scale_colour_economist()
density

Correlations among all numerical independent variables

cormatrix<-cor(dfnum1)
cordf<-as.data.frame(cormatrix)
kable(head(cordf))
MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars GarageArea WoodDeckSF PoolArea MiscVal MoSold YrSold SalePrice
MSSubClass 1.0000 -0.2150 -0.1398 0.0326 -0.0593 0.0279 0.0406 0.0236 -0.0698 -0.0656 -0.1408 -0.2385 -0.2518 0.3079 0.0465 0.0749 0.0035 -0.0023 0.1316 0.1774 -0.0234 0.2817 0.0404 -0.0456 -0.0810 -0.0401 -0.0987 -0.0126 0.0083 -0.0077 -0.0136 -0.0214 -0.0843
LotFrontage -0.2150 1.0000 0.1007 0.1766 -0.0535 0.0369 0.0787 0.1050 0.0767 -0.0093 0.1608 0.2383 0.2452 0.0425 0.0500 0.2203 0.0105 -0.0279 0.1205 -0.0130 0.1445 0.0344 0.2214 0.0440 0.0193 0.1652 0.2015 -0.0168 0.1141 -0.0596 0.0189 -0.0121 0.2096
LotArea -0.1398 0.1007 1.0000 0.1058 -0.0056 0.0142 0.0138 0.1033 0.2141 0.1112 -0.0026 0.2608 0.2995 0.0510 0.0048 0.2631 0.1582 0.0480 0.1260 0.0143 0.1197 -0.0178 0.1900 0.2714 0.0726 0.1549 0.1804 0.1717 0.0777 0.0381 0.0012 -0.0143 0.2638
OverallQual 0.0326 0.1766 0.1058 1.0000 -0.0919 0.5723 0.5507 0.4073 0.2397 -0.0591 0.3082 0.5378 0.4762 0.2955 -0.0304 0.5930 0.1111 -0.0402 0.5506 0.2735 0.1017 -0.1839 0.4275 0.3968 0.2890 0.6007 0.5620 0.2389 0.0652 -0.0314 0.0708 -0.0273 0.7910
OverallCond -0.0593 -0.0535 -0.0056 -0.0919 1.0000 -0.3760 0.0737 -0.1257 -0.0462 0.0402 -0.1368 -0.1711 -0.1442 0.0289 0.0255 -0.0797 -0.0549 0.1178 -0.1941 -0.0608 0.0130 -0.0870 -0.0576 -0.0238 -0.0065 -0.1858 -0.1515 -0.0033 -0.0020 0.0688 -0.0035 0.0439 -0.0779
YearBuilt 0.0279 0.0369 0.0142 0.5723 -0.3760 1.0000 0.5929 0.3116 0.2495 -0.0491 0.1490 0.3915 0.2820 0.0103 -0.1838 0.1990 0.1876 -0.0382 0.4683 0.2427 -0.0707 -0.1748 0.0956 0.1477 0.2720 0.5379 0.4790 0.2249 0.0049 -0.0344 0.0124 -0.0136 0.5229

Independent variables with pairwise correlations greater than 0.5 with each other (multicollinearity check)

cordf[cordf == 1] <- NA #drop correlation of 1, diagonals
cordf[abs(cordf) < 0.5] <- NA # drop correlations of less than 0.5
cordf<-as.matrix(cordf)
cordf2<- na.omit(melt(cordf)) 
kable(head(cordf2[order(-abs(cordf2$value)),])) # sort by highest correlations
Var1 Var2 value
852 GarageArea GarageCars 0.8825
884 GarageCars GarageArea 0.8825
518 TotRmsAbvGrd GrLivArea 0.8255
742 GrLivArea TotRmsAbvGrd 0.8255
376 X1stFlrSF TotalBsmtSF 0.8195
408 TotalBsmtSF X1stFlrSF 0.8195

Correlation Matrix for Some Significant Relationships

Test the hypotheses that the correlations between each pairwise set of variables (see below) are 0, and provide an 80% confidence interval.

headers = c("GarageCars","GrLivArea","GarageArea","TotRmsAbvGrd","TotalBsmtSF","X1stFlrSF")

newdata <- dfnum[headers]
correlation = cor(newdata )
corrplot(correlation, method="pie")

#hypotheses testing
#X1stFlrSF with TotalBsmtSF
cor.test(dfnum1$X1stFlrSF, dfnum1$TotalBsmtSF, method = c("pearson", "kendall", "spearman"), conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 55, df = 1458, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.8082 0.8303
## sample estimates:
##    cor 
## 0.8195
#TotRmsAbvGrd with  GrLivArea
cor.test(dfnum1$TotRmsAbvGrd , dfnum1$GrLivArea, method = c("pearson", "kendall", "spearman"), conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 56, df = 1458, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.8145 0.8359
## sample estimates:
##    cor 
## 0.8255
#GarageArea with GarageCars
cor.test(dfnum1$GarageArea , dfnum1$GarageCars, method = c("pearson", "kendall", "spearman"), conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 72, df = 1458, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.8748 0.8897
## sample estimates:
##    cor 
## 0.8825

Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

ans:

-In all three cases the p-values are extremely small, well below the 0.05 significance level. We reject the null hypothesis and conclude that the true correlation is not 0 for each of the selected pairs of variables; they are significantly correlated.

With such small p-values, and with the correlation matrix already showing how strongly related these variables are, I would not be very worried about familywise error in this case; nonetheless, it is worth understanding what it is.

According to https://www.statisticshowto.com/familywise-error-rate/:

The familywise error rate (FWE or FWER) is the probability of coming to at least one false conclusion in a series of hypothesis tests. In other words, it is the probability of making at least one Type I error. The term "familywise" comes from a family of tests, the technical term for a series of tests on the same data.

Formula: The formula to estimate the familywise error rate is: FWER ≤ 1 – (1 – α)^c

Where:

α = alpha level for an individual test (e.g. .05), c = Number of comparisons.

From the equation above, FWER is a function of the number of comparisons c and of alpha. If you want to hold the overall Type I error rate near 0.05, then with about 5 comparison tests each individual test's alpha must be reduced to roughly 0.01; see the graph below.


#plotting FWE vs c
alpha = 0.01
eq = function(c){1-(1-alpha)^c}
ggplot(data.frame(x=c(1, 10)), aes(x=x)) + 
  stat_function(fun=eq)+theme_economist()+scale_colour_economist()+  xlab("c, Number of comparison tests") +
  ylab("Familywise error rate") +
  ggtitle("Family-wise error (FWER) as a function of comparison tests")+ geom_vline(xintercept = 5, linetype="dotted", 
                color = "red", size=1.5)+ geom_hline(yintercept=0.05, linetype="dotted", color = "red", size=1.5)
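If one did want to guard against familywise error here, the three correlation-test p-values could be adjusted directly; a minimal sketch using p.adjust, assuming the same three variable pairs tested above:

#collect the three unadjusted p-values and apply FWER corrections
pvals <- c(cor.test(dfnum1$X1stFlrSF, dfnum1$TotalBsmtSF)$p.value,
           cor.test(dfnum1$TotRmsAbvGrd, dfnum1$GrLivArea)$p.value,
           cor.test(dfnum1$GarageArea, dfnum1$GarageCars)$p.value)
p.adjust(pvals, method = "bonferroni") #each p-value multiplied by the number of tests (capped at 1)
p.adjust(pvals, method = "holm")       #step-down method, slightly less conservative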


Independent variables whose correlation with SalePrice (the dependent variable) is greater than 0.5


cordf2<-as.data.frame(cordf2)
topcors <- cordf2[ which(cordf2$Var2=='SalePrice'),]

topcorsdf<-topcors[order(-abs(topcors$value)),]# sort by highest correlations

cors1<-data.frame(topcorsdf$Var1,topcorsdf$Var2,topcorsdf$value)
cors2<-rename(cors1,Correlation_to_Sale=topcorsdf.value,Independent_variables=topcorsdf.Var1)
kable(cors2)
Independent_variables topcorsdf.Var2 Correlation_to_Sale
OverallQual SalePrice 0.7910
GrLivArea SalePrice 0.7086
GarageCars SalePrice 0.6404
GarageArea SalePrice 0.6234
TotalBsmtSF SalePrice 0.6136
X1stFlrSF SalePrice 0.6059
FullBath SalePrice 0.5607
TotRmsAbvGrd SalePrice 0.5337
YearBuilt SalePrice 0.5229
YearRemodAdd SalePrice 0.5071

Correlation Plot of Sale Price w.r.t. Independent Variables greater than 0.5

p<-ggplot(data=cors2, aes(x=Independent_variables, y=Correlation_to_Sale, fill=Independent_variables)) +
  geom_bar(stat="identity")+ggtitle("Fig1: Top ten Correlation Plot of Sale Price vs. Independent Variables")

# Horizontal bar plot
p + coord_flip()+theme_economist()+scale_colour_economist()

Provide a scatterplot matrix for at least two of the independent variables and the dependent variable

ggplot(dfnum1, aes(x=OverallQual, y=SalePrice)) + geom_point(shape=1) +
    geom_smooth(method=lm,   # Add linear regression lines
                se=T,    # Add shaded confidence region
                fullrange=TRUE)+theme_economist()+scale_colour_economist() +ggtitle("Fig2a: Scatter plot of OverallQual vs SalePrice")

ggplot(dfnum1, aes(x=GrLivArea, y=SalePrice)) + geom_point(shape=1) +
    geom_smooth(method=lm,   # Add linear regression lines
                se=T,    # Add shaded confidence region
                fullrange=TRUE)+theme_economist()+scale_colour_economist() +ggtitle("Fig2b: Scatter plot of GrLivArea vs SalePrice")


Linear Algebra and Correlation Matrix Decomposition exercise.


#inverse of correlation matrix
correlation_inverted = solve(correlation)
correlation_inverted
##              GarageCars GrLivArea GarageArea TotRmsAbvGrd TotalBsmtSF X1stFlrSF
## GarageCars       4.6265   -0.1360    -3.9723     -0.26552     -0.0630   0.15041
## GrLivArea       -0.1360    4.1111    -0.3814     -2.85138     -0.1918  -0.75554
## GarageArea      -3.9723   -0.3814     4.8783      0.31841     -0.3269  -0.29078
## TotRmsAbvGrd    -0.2655   -2.8514     0.3184      3.26498      0.3969  -0.08771
## TotalBsmtSF     -0.0630   -0.1918    -0.3269      0.39692      3.1998  -2.48847
## X1stFlrSF        0.1504   -0.7555    -0.2908     -0.08771     -2.4885   3.57928
#multiply the precision matrix by the correlation matrix
correlation_inverted %*% correlation
##              GarageCars  GrLivArea GarageArea TotRmsAbvGrd TotalBsmtSF
## GarageCars    1.000e+00 -6.939e-17  1.665e-16   -4.163e-17  -2.776e-17
## GrLivArea    -3.331e-16  1.000e+00 -2.220e-16    3.886e-16  -1.110e-16
## GarageArea    2.498e-16  5.551e-17  1.000e+00    1.110e-16   2.776e-17
## TotRmsAbvGrd -2.012e-16 -2.637e-16  8.327e-17    1.000e+00  -2.776e-17
## TotalBsmtSF   4.441e-16  2.220e-16  4.441e-16    0.000e+00   1.000e+00
## X1stFlrSF     0.000e+00  0.000e+00  0.000e+00    0.000e+00  -4.441e-16
##               X1stFlrSF
## GarageCars    1.110e-16
## GrLivArea     2.220e-16
## GarageArea   -5.551e-17
## TotRmsAbvGrd -3.192e-16
## TotalBsmtSF   8.882e-16
## X1stFlrSF     1.000e+00
#multiply the correlation matrix by the precision matrix
correlation %*% correlation_inverted
##              GarageCars GrLivArea GarageArea TotRmsAbvGrd TotalBsmtSF X1stFlrSF
## GarageCars    1.000e+00 0.000e+00  3.053e-16   -1.388e-17   0.000e+00         0
## GrLivArea    -3.192e-16 1.000e+00  1.110e-16    1.457e-16   0.000e+00         0
## GarageArea   -2.914e-16 5.551e-17  1.000e+00   -1.735e-16   0.000e+00         0
## TotRmsAbvGrd -2.429e-16 5.551e-17  1.943e-16    1.000e+00   0.000e+00         0
## TotalBsmtSF  -3.053e-16 0.000e+00  3.608e-16    1.249e-16   1.000e+00         0
## X1stFlrSF    -3.886e-16 2.220e-16  2.776e-16    5.551e-17   4.441e-16         1

Observation 2: multiplying the correlation matrix by its inverse (the precision matrix), in either order, returns the identity matrix up to floating-point rounding error; the off-diagonal entries are on the order of 1e-16.
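A quick numerical check (a sketch) confirms this:

#largest absolute deviation from the identity matrix, in either order of multiplication
max(abs(correlation_inverted %*% correlation - diag(nrow(correlation))))
max(abs(correlation %*% correlation_inverted - diag(nrow(correlation))))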



A <- correlation # let the correlation matrix be assigned to matrix A
luDec <- lu(A)
L <- expand(luDec)$L
L #lower triangle matrix
## 6 x 6 Matrix of class "dtrMatrix" (unitriangular)
##      [,1]     [,2]     [,3]     [,4]     [,5]     [,6]    
## [1,]  1.00000        .        .        .        .        .
## [2,]  0.46725  1.00000        .        .        .        .
## [3,]  0.88248  0.07249  1.00000        .        .        .
## [4,]  0.36229  0.83949 -0.13566  1.00000        .        .
## [5,]  0.43458  0.32214  0.39102 -0.22858  1.00000        .
## [6,]  0.43932  0.46151  0.34977 -0.13442  0.69524  1.00000
U <- expand(luDec)$U
U #upper triangle matrix
## 6 x 6 Matrix of class "dtrMatrix"
##      [,1]     [,2]     [,3]     [,4]     [,5]     [,6]    
## [1,]  1.00000  0.46725  0.88248  0.36229  0.43458  0.43932
## [2,]        .  0.78168  0.05666  0.65621  0.25181  0.36075
## [3,]        .        .  0.21713 -0.02946  0.08490  0.07594
## [4,]        .        .        .  0.31387 -0.07175 -0.04219
## [5,]        .        .        .        .  0.68042  0.47306
## [6,]        .        .        .        .        .  0.27939
P <- expand(luDec)$P
P #permutation matrix (sparse)
## 6 x 6 sparse Matrix of class "pMatrix"
##                 
## [1,] | . . . . .
## [2,] . | . . . .
## [3,] . . | . . .
## [4,] . . . | . .
## [5,] . . . . | .
## [6,] . . . . . |
L %*% U
## 6 x 6 Matrix of class "dgeMatrix"
##        [,1]   [,2]   [,3]   [,4]   [,5]   [,6]
## [1,] 1.0000 0.4672 0.8825 0.3623 0.4346 0.4393
## [2,] 0.4672 1.0000 0.4690 0.8255 0.4549 0.5660
## [3,] 0.8825 0.4690 1.0000 0.3378 0.4867 0.4898
## [4,] 0.3623 0.8255 0.3378 1.0000 0.2856 0.4095
## [5,] 0.4346 0.4549 0.4867 0.2856 1.0000 0.8195
## [6,] 0.4393 0.5660 0.4898 0.4095 0.8195 1.0000
P %*% L %*% U # P L U reconstructs the original matrix from the LU decomposition
## 6 x 6 Matrix of class "dgeMatrix"
##        [,1]   [,2]   [,3]   [,4]   [,5]   [,6]
## [1,] 1.0000 0.4672 0.8825 0.3623 0.4346 0.4393
## [2,] 0.4672 1.0000 0.4690 0.8255 0.4549 0.5660
## [3,] 0.8825 0.4690 1.0000 0.3378 0.4867 0.4898
## [4,] 0.3623 0.8255 0.3378 1.0000 0.2856 0.4095
## [5,] 0.4346 0.4549 0.4867 0.2856 1.0000 0.8195
## [6,] 0.4393 0.5660 0.4898 0.4095 0.8195 1.0000
correlation #the original correlation matrix, which was assigned to A
##              GarageCars GrLivArea GarageArea TotRmsAbvGrd TotalBsmtSF X1stFlrSF
## GarageCars       1.0000    0.4672     0.8825       0.3623      0.4346    0.4393
## GrLivArea        0.4672    1.0000     0.4690       0.8255      0.4549    0.5660
## GarageArea       0.8825    0.4690     1.0000       0.3378      0.4867    0.4898
## TotRmsAbvGrd     0.3623    0.8255     0.3378       1.0000      0.2856    0.4095
## TotalBsmtSF      0.4346    0.4549     0.4867       0.2856      1.0000    0.8195
## X1stFlrSF        0.4393    0.5660     0.4898       0.4095      0.8195    1.0000

Calculus-Based Probability & Statistics.


#plot a histogram of the SalePrice variable
hist(dfnum1$SalePrice , col = 'yellow', main = 'Fig3a: SalePrice - the dependent variable to investigate', breaks = 30)

summary(dfnum1$SalePrice ) # original distribution of saleprice
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000


library(MASS)# load the MASS library per instruction

edist<-fitdistr(dfnum1$SalePrice, densfun = 'exponential')
lambda <- edist$estimate
exp_distibution <- rexp(1000, lambda)

summary(exp_distibution)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      15   54796  125590  184969  252524 2025507
#Optimal lambda value
print("The fitted optimal lambda based on N = 1000 samples")
## [1] "The fitted optimal lambda based on N = 1000 samples"
lambda 
##      rate 
## 5.527e-06


par(mfrow=c(1,2))
#original histogram without exponential fitting
hist(dfnum1$SalePrice , col = 'yellow', main = 'Fig3a: SalePrice - original distribution', breaks = 30)
#histogram with exponential fitting
hist(exp_distibution, freq = FALSE, breaks = 100, main ="Fig3b: Fitted Exponential PDF SalePrice",col = "blue", xlim = c(1, quantile(exp_distibution, 0.99)))
curve(dexp(x, rate = lambda), col = "red", add = TRUE)

#Generating the 5th and 95th percentiles
print("The 95% confidence interval of exponential distribution with the optimal lambda")
## [1] "The 95% confidence interval of exponential distribution with the optimal lambda"
qexp(c(0.05,0.95), lambda)
## [1]   9280 541991
#Generating 95% confidence level from the data, assuming that the distribution is normal
print("The 95% confidence interval of original sales price distribution assuming normality")
## [1] "The 95% confidence interval of original sales price distribution assuming normality"
qnorm(c(0.05, 0.95), mean = mean(dfnum1$SalePrice), sd = sd(dfnum1$SalePrice))
## [1]  50250 311592
#the empirical 5th percentile and 95th percentile of the data
print("The empirically generated 95% confidence interval of actual sales price data")
## [1] "The empirically generated 95% confidence interval of actual sales price data"
quantile(dfnum1$SalePrice, c(0.05, 0.95))
##     5%    95% 
##  88000 326100

Let’s break this down one item at a time:

Given that the fitted rate and the reciprocal of the sample mean (checked below) are approximately the same, we can say 5.527e-06 is the optimal (maximum-likelihood) lambda.
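A minimal check of this, using the fact that the exponential MLE rate is the reciprocal of the sample mean:

#the exponential maximum-likelihood rate equals 1 / sample mean
1/mean(dfnum1$SalePrice) #compare with the fitted lambda above (about 5.527e-06)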

Based on these samples, the best single point estimate of a typical sale price is the sample mean (here 180,921). That is a good point estimate, but it is also useful to communicate how spread out the values are around it.

The intervals above capture that spread in three ways: the 5th and 95th percentiles of the fitted exponential distribution, the 5th and 95th percentiles of a normal distribution with the sample mean and standard deviation, and the empirical 5th and 95th percentiles of the data. Each pair of numbers bounds the central 90% of sale prices under its respective assumption.

Even though we do not observe the full population, these intervals tell us where the vast majority of sale prices fall. The exponential-based interval is by far the widest, which already suggests the exponential is not a great fit for SalePrice. A confidence interval for the mean itself would be much narrower, because it is based on the standard error rather than the standard deviation (see the sketch below).
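For contrast, a minimal sketch of an approximate 95% confidence interval for the mean sale price, using the standard error:

#approximate 95% confidence interval for the mean sale price (normal approximation)
se <- sd(dfnum1$SalePrice)/sqrt(length(dfnum1$SalePrice))
mean(dfnum1$SalePrice) + c(-1.96, 1.96)*se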


Modeling in Kaggle


#these predictors were found to have high correlation to sales price, see Fig1
headers1 = c("GarageArea","GrLivArea","FullBath","OverallQual","TotalBsmtSF","YearBuilt","YearRemodAdd")
# This is to make sure we did not accidentally pick predictors with high VIF (strong multicollinearity)
newdata1 <- dfnum[headers1]
correlation1 = cor(newdata1 )
print("Correlational relationships of Predictors with each other")
## [1] "Correlational relationships of Predictors with each other"
correlation1
##              GarageArea GrLivArea FullBath OverallQual TotalBsmtSF YearBuilt
## GarageArea       1.0000    0.4690   0.4057      0.5620      0.4867    0.4790
## GrLivArea        0.4690    1.0000   0.6300      0.5930      0.4549    0.1990
## FullBath         0.4057    0.6300   1.0000      0.5506      0.3237    0.4683
## OverallQual      0.5620    0.5930   0.5506      1.0000      0.5378    0.5723
## TotalBsmtSF      0.4867    0.4549   0.3237      0.5378      1.0000    0.3915
## YearBuilt        0.4790    0.1990   0.4683      0.5723      0.3915    1.0000
## YearRemodAdd     0.3716    0.2874   0.4390      0.5507      0.2911    0.5929
##              YearRemodAdd
## GarageArea         0.3716
## GrLivArea          0.2874
## FullBath           0.4390
## OverallQual        0.5507
## TotalBsmtSF        0.2911
## YearBuilt          0.5929
## YearRemodAdd       1.0000
#fit the regression model
model <- lm(SalePrice ~ GarageArea+ GrLivArea + FullBath + OverallQual+TotalBsmtSF+YearBuilt+YearRemodAdd ,data = dfnum1)
#view the output of the regression model
summary(model)
## 
## Call:
## lm(formula = SalePrice ~ GarageArea + GrLivArea + FullBath + 
##     OverallQual + TotalBsmtSF + YearBuilt + YearRemodAdd, data = dfnum1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -519713  -19055   -2564   14943  289099 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.21e+06   1.28e+05   -9.50  < 2e-16 ***
## GarageArea    4.56e+01   6.14e+00    7.41  2.1e-13 ***
## GrLivArea     5.40e+01   3.01e+00   17.96  < 2e-16 ***
## FullBath     -5.45e+03   2.66e+03   -2.05     0.04 *  
## OverallQual   1.98e+04   1.18e+03   16.77  < 2e-16 ***
## TotalBsmtSF   2.79e+01   2.88e+00    9.68  < 2e-16 ***
## YearBuilt     2.84e+02   4.98e+01    5.70  1.5e-08 ***
## YearRemodAdd  2.98e+02   6.40e+01    4.65  3.6e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38100 on 1452 degrees of freedom
## Multiple R-squared:  0.771,  Adjusted R-squared:  0.769 
## F-statistic:  696 on 7 and 1452 DF,  p-value: <2e-16
#calculate the VIF for each predictor variable in the model
 VIF <- function(linear.model, no.intercept=FALSE, all.diagnostics=FALSE, plot=FALSE) {
    require(mctest)
    if(no.intercept==FALSE) design.matrix <- model.matrix(linear.model)[,-1]
    if(no.intercept==TRUE) design.matrix <- model.matrix(linear.model)
    if(plot==TRUE) mc.plot(design.matrix,linear.model$model[1])
    if(all.diagnostics==FALSE) output <- imcdiag(design.matrix,linear.model$model[1], method='VIF')$idiags[,1]
    if(all.diagnostics==TRUE) output <- imcdiag(design.matrix,linear.model$model[1])
    output
 }


VIF(model)
## Loading required package: mctest
##   GarageArea    GrLivArea     FullBath  OverallQual  TotalBsmtSF    YearBuilt 
##        1.730        2.503        2.148        2.662        1.599        2.268 
## YearRemodAdd 
##        1.750
#create vector of VIF values
vif_values <- VIF(model)
#create horizontal bar chart to display each VIF value
barplot(vif_values, main = "Fig4: VIF Values on Model 1", horiz = TRUE, col = "gold",las=2,cex.names=.53,xlim=c(0,5))
#add vertical line at 5
abline(v = 5, lwd = 3, lty = 2)

- The above VIF analysis and the correlation table show that the 7 chosen predictors do not exhibit significant multicollinearity (all VIFs < 5) and should therefore make good predictors for the dependent variable
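For reference, since the car package is already loaded, car::vif should return essentially the same values as the custom VIF() helper above (a sketch):

#standard variance inflation factors from the car package
car::vif(model)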

Statistical Diagnostics

The purpose of this set of tests is to check that the model is correctly specified


# Compute the analysis of variance
res.aov <- aov(model, data = dfnum1)
# Summary of the analysis
summary(res.aov)
##                Df   Sum Sq  Mean Sq F value  Pr(>F)    
## GarageArea      1 3.58e+12 3.58e+12  2459.2 < 2e-16 ***
## GrLivArea       1 2.05e+12 2.05e+12  1405.3 < 2e-16 ***
## FullBath        1 8.38e+10 8.38e+10    57.6 5.8e-14 ***
## OverallQual     1 1.10e+12 1.10e+12   757.1 < 2e-16 ***
## TotalBsmtSF     1 1.67e+11 1.67e+11   114.7 < 2e-16 ***
## YearBuilt       1 8.70e+10 8.70e+10    59.8 2.0e-14 ***
## YearRemodAdd    1 3.15e+10 3.15e+10    21.6 3.6e-06 ***
## Residuals    1452 2.11e+12 1.46e+09                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 1. Homogeneity of variances check
plot(res.aov, 1)

# 2. Normality of residual check
plot(res.aov, 2)

#histogram of residuals
ggplot(model, aes(x=.resid)) + geom_histogram(bins=30)


Summary of statistical diagnostics & overall evaluation on Model 1:

  1. The \(R^{2}\) of 0.771 (adjusted 0.769) and the highly significant F-statistic indicate a model with good explanatory power
  2. All the coefficients of the model are statistically significant, with |t|-statistics > 2 and correspondingly low p-values (<0.05)
  3. The residuals of the model look poor, though: (A) they exhibit heteroscedasticity, (B) normality of the residuals is not met, and (C) the histogram of residuals reveals a few outliers (which may or may not need corrective action)

Overall, Model 1 is not entirely correctly specified; it is good but not great.

The prescribed solution is to perform some transformations on some or all of the predictor variables to improve the model.

  1. From our initial box and density plots, we can see which prescription to apply to the predictor variables:

Let's start with the obvious first: taking logs. We then review the transformed predictors and decide whether to keep them or not.


Transform the predictors by taking logs

#headers1 = c("GarageCars","GrLivArea","FullBath","OverallQual","TotalBsmtSF","YearBuilt","YearRemodAdd")
dfnum2<-fortify(dfnum1)
GrLivArealog= log(dfnum2$GrLivArea+min(dfnum2$GrLivArea))
TotalBsmtSFlog= log(dfnum2$TotalBsmtSF)
YearBuiltlog=log(dfnum2$YearBuilt)
YearRemodAddlog=log(dfnum2$YearRemodAdd)
Garagearealog=log(dfnum2$GarageArea)
#FullBathlog=log(dfnum2$FullBath)

Plotting the transformed data

par(mfrow=c(3,3))
hist(GrLivArealog , col = 'yellow', main = 'Fig5a: Log of GrLivArea', breaks = 30)
hist(TotalBsmtSFlog , col = 'green', main = 'Fig5b: Log of TotalBsmtSF', breaks = 30)
hist(YearBuiltlog, col = 'red', main = 'Fig5c: Log of YearBuilt', breaks = 30)
hist(YearRemodAddlog , col = 'orange', main = 'Fig5e: Log of YearRemodAdd', breaks = 30)
hist(Garagearealog , col = 'darkorchid2', main = 'Fig5f: Log of Garagearea', breaks = 30)

Taking the logs of YearRemodAdd, YearBuilt and FullBath did not look promising, so I decided to drop them as predictors in my next model.
#fit the regression model again but with transformed data
 
#dfnum3<-dfnum2 %>% mutate(TotalBsmtSFlog=log(dfnum2$TotalBsmtSF),Garagearealog=log(dfnum2$GarageArea),GrLivArealog=log(dfnum2$GrLivArea))
#model1 <- lm((SalePrice)~ TotalBsmtSFlog+Garagearealog+GrLivArealog+OverallQual,data = dfnum3,na.action=na.exclude)
#view the output of the regression model with transformed data
#summary(model1)
I was getting this error after taking the log of the predictors: Error in lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) : NA/NaN/Inf in 'x'. The cause is that many homes have a TotalBsmtSF or GarageArea of 0, and log(0) = -Inf.
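A quick check (a sketch) confirms the source of the -Inf values:

#count the zero values that produce log(0) = -Inf in the transformed predictors
sum(dfnum2$TotalBsmtSF == 0) #homes with no basement
sum(dfnum2$GarageArea == 0)  #homes with no garage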

Corrective Actions:


dfnum3<-dfnum2 %>% mutate(TotalBsmtSFlog=log(dfnum2$TotalBsmtSF+1),Garagearealog=log(dfnum2$GarageArea+1),GrLivArealog=log(dfnum2$GrLivArea+1))
#view the output of the regression model with transformed data
model1 <- lm((SalePrice)~ TotalBsmtSFlog+Garagearealog+GrLivArealog+OverallQual,data = dfnum3,na.action=na.exclude)
summary(model1)
## 
## Call:
## lm(formula = (SalePrice) ~ TotalBsmtSFlog + Garagearealog + GrLivArealog + 
##     OverallQual, data = dfnum3, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -263785  -24419   -2463   19308  357747 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -628841      28513  -22.05  < 2e-16 ***
## TotalBsmtSFlog     4123       1058    3.90  0.00010 ***
## Garagearealog      3108        848    3.66  0.00026 ***
## GrLivArealog      78772       4339   18.15  < 2e-16 ***
## OverallQual       31368       1116   28.11  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43400 on 1455 degrees of freedom
## Multiple R-squared:  0.702,  Adjusted R-squared:  0.701 
## F-statistic:  858 on 4 and 1455 DF,  p-value: <2e-16
any(is.na(dfnum3$TotalBsmtSFlog)) #check to see if we have any NA's
## [1] FALSE
res.aov1 <- aov(model1, data = dfnum3)
# Summary of the analysis
summary(res.aov1)
##                  Df   Sum Sq  Mean Sq F value Pr(>F)    
## TotalBsmtSFlog    1 9.78e+11 9.78e+11     519 <2e-16 ***
## Garagearealog     1 9.37e+11 9.37e+11     497 <2e-16 ***
## GrLivArealog      1 3.06e+12 3.06e+12    1625 <2e-16 ***
## OverallQual       1 1.49e+12 1.49e+12     790 <2e-16 ***
## Residuals      1455 2.74e+12 1.88e+09                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(res.aov1, 2)

plot(res.aov1, 1)


Stepwise Regression


#setting a "full model" with all the numeric predictors in it
full.model <- lm(SalePrice ~ MSSubClass + LotFrontage + LotArea + OverallQual 
                       + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 +
                      BsmtFinSF2  + X1stFlrSF + X2ndFlrSF  +
                      BsmtFullBath + BedroomAbvGr  +
                      TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF +
                      PoolArea, data = dfnum3)
# Stepwise regression model- in both directions
step.model <- stepAIC(full.model, direction = "both", 
                      trace = FALSE,k=2)
summary(step.model)
## 
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual + 
##     YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + X1stFlrSF + 
##     X2ndFlrSF + BsmtFullBath + BedroomAbvGr + TotRmsAbvGrd + 
##     Fireplaces + GarageCars + WoodDeckSF, data = dfnum3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -484184  -16921   -1841   14129  284221 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.04e+06   1.15e+05   -9.00  < 2e-16 ***
## MSSubClass   -1.99e+02   2.41e+01   -8.25  3.4e-16 ***
## LotArea       4.25e-01   1.01e-01    4.20  2.8e-05 ***
## OverallQual   1.97e+04   1.11e+03   17.76  < 2e-16 ***
## YearBuilt     2.02e+02   4.51e+01    4.48  8.2e-06 ***
## YearRemodAdd  3.00e+02   6.02e+01    4.99  6.9e-07 ***
## MasVnrArea    3.22e+01   5.93e+00    5.44  6.2e-08 ***
## BsmtFinSF1    1.28e+01   2.99e+00    4.30  1.8e-05 ***
## X1stFlrSF     5.16e+01   4.59e+00   11.24  < 2e-16 ***
## X2ndFlrSF     4.71e+01   4.09e+00   11.50  < 2e-16 ***
## BsmtFullBath  7.80e+03   2.40e+03    3.25   0.0012 ** 
## BedroomAbvGr -8.91e+03   1.67e+03   -5.33  1.1e-07 ***
## TotRmsAbvGrd  4.04e+03   1.18e+03    3.44   0.0006 ***
## Fireplaces    4.83e+03   1.72e+03    2.81   0.0050 ** 
## GarageCars    1.03e+04   1.71e+03    6.04  2.0e-09 ***
## WoodDeckSF    2.58e+01   7.88e+00    3.28   0.0011 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35200 on 1444 degrees of freedom
## Multiple R-squared:  0.806,  Adjusted R-squared:  0.804 
## F-statistic:  400 on 15 and 1444 DF,  p-value: <2e-16
#Note1: FullBath, BsmtUnfSF & LowQualFinSF were left out because they do not contribute significantly to the model
#Note2: KitchenAbvGr & OverallCond had statistically insignificant F-values
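As a quick sanity check (a sketch, assuming both fitted objects are still in the workspace), AIC can be used to compare Model 1 with the stepwise model; the lower value indicates the better trade-off between fit and complexity:

#compare Model 1 and the stepwise model by AIC (lower is better)
AIC(model, step.model)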

Statistical Diagnostics Post-Stepwise Regression


res.aov3 <- aov(step.model, data = dfnum3)
# Summary of the analysis
summary(res.aov3)
##                Df   Sum Sq  Mean Sq F value  Pr(>F)    
## MSSubClass      1 6.54e+10 6.54e+10   52.86 5.9e-13 ***
## LotArea         1 5.97e+11 5.97e+11  482.16 < 2e-16 ***
## OverallQual     1 5.47e+12 5.47e+12 4418.09 < 2e-16 ***
## YearBuilt       1 8.54e+10 8.54e+10   69.02 < 2e-16 ***
## YearRemodAdd    1 3.74e+10 3.74e+10   30.23 4.5e-08 ***
## MasVnrArea      1 2.17e+11 2.17e+11  175.45 < 2e-16 ***
## BsmtFinSF1      1 1.68e+11 1.68e+11  135.55 < 2e-16 ***
## X1stFlrSF       1 1.79e+11 1.79e+11  144.54 < 2e-16 ***
## X2ndFlrSF       1 4.65e+11 4.65e+11  375.61 < 2e-16 ***
## BsmtFullBath    1 1.91e+10 1.91e+10   15.47 8.8e-05 ***
## BedroomAbvGr    1 3.12e+10 3.12e+10   25.21 5.8e-07 ***
## TotRmsAbvGrd    1 1.87e+10 1.87e+10   15.13 0.00011 ***
## Fireplaces      1 1.17e+10 1.17e+10    9.41 0.00219 ** 
## GarageCars      1 4.60e+10 4.60e+10   37.14 1.4e-09 ***
## WoodDeckSF      1 1.33e+10 1.33e+10   10.73 0.00108 ** 
## Residuals    1444 1.79e+12 1.24e+09                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(res.aov3, 2)

plot(res.aov3, 1)

ggplot(step.model, aes(x=.resid)) + geom_histogram(bins=30)

#Check for VIFs
VIF(step.model)
##   MSSubClass      LotArea  OverallQual    YearBuilt YearRemodAdd   MasVnrArea 
##        1.223        1.201        2.777        2.188        1.820        1.353 
##   BsmtFinSF1    X1stFlrSF    X2ndFlrSF BsmtFullBath BedroomAbvGr TotRmsAbvGrd 
##        2.187        3.713        3.768        1.825        2.194        4.304 
##   Fireplaces   GarageCars   WoodDeckSF 
##        1.447        1.918        1.149
#create vector of VIF values
vif_values1 <- VIF(step.model)

#create horizontal bar chart to display and check each VIF values again after step wise regression
barplot(vif_values1, main = "Fig6: VIF Values after step-wise regression", horiz = TRUE, col = "turquoise1",las=2,cex.names=.53,xlim=c(0,5))
#add vertical line at 5
abline(v = 5, lwd = 3, lty = 2)

#check whether the outliers need our attention or not
outlierTest(step.model)
##      rstudent unadjusted p-value Bonferroni p
## 1299  -16.522          2.687e-56    3.923e-53
## 524   -10.337          3.267e-24    4.770e-21
## 1183    8.509          4.313e-17    6.298e-14
## 692     8.128          9.317e-16    1.360e-12
## 804     6.923          6.613e-12    9.655e-09
## 899     6.268          4.827e-10    7.048e-07
## 1047    6.154          9.790e-10    1.429e-06
## 1170    4.790          1.841e-06    2.688e-03
## 441     4.706          2.766e-06    4.039e-03

1) The slope coefficients are still significant: t-statistics > 2 with correspondingly low p-values (<0.05)
2) The F-values exhibit statistical significance
3) The VIF values of the predictors are all lower than 5
4) The adjusted R-squared of 0.804 is a marked improvement over Model 1, which was at 0.769
5) The residuals of Model 2 look better: (A) some heteroscedasticity remains, but it is less pronounced than in Model 1; (B) normality of the residuals is not perfect but shows a marked improvement; (C) the histogram of residuals still reveals a few outliers, so an outlier test was run to see whether corrective action is needed
6) Several observations have very small Bonferroni-adjusted p-values in the outlier test, indicating that they are genuine outliers (see the last output above); a refit without them is sketched below as a possible corrective action
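If corrective action were desired, one option (a hypothetical sketch, not applied here) would be to refit the stepwise model without the flagged observations and compare the fit:

#refit without the rows flagged by outlierTest (row labels taken from the output above)
flagged <- c(1299, 524, 1183, 692, 804, 899, 1047, 1170, 441)
step.model.noout <- update(step.model, data = dfnum3[-flagged, ])
summary(step.model.noout)$adj.r.squared #compare with 0.804 from the full data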

Let's apply our final Model 2 to the test data and make some predictions.


# Read in from Kaggle the test data set
url2<- "https://raw.githubusercontent.com/ssufian/DAT-605/master/test.csv"
test_df <- read.csv(url2, header = TRUE, sep = ",",stringsAsFactors = FALSE)
#replace NA with zeros across all columns, as was done for Model 1
test_df1<-test_df %>% mutate_all(~replace(., is.na(.), 0))
#Filter Test Data from Kaggle to reflect only the numerical predictors that we had used in the training set
test_df2<- dplyr::select(test_df1,Id,MSSubClass,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,X1stFlrSF,X2ndFlrSF ,BsmtFullBath ,BedroomAbvGr,TotRmsAbvGrd , Fireplaces ,GarageCars,WoodDeckSF,PoolArea)
#Make predictions with selected predictors
test_results<-predict(step.model, test_df2)
head(test_results) #checking the results
##      1      2      3      4      5      6 
## 113289 165950 175587 198751 189161 184604
#Making columns per Kaggle's instructions
prediction <- data.frame(Id = test_df2[,"Id"],  SalePrice = test_results)

head(prediction, 10)
##      Id SalePrice
## 1  1461    113289
## 2  1462    165950
## 3  1463    175587
## 4  1464    198751
## 5  1465    189161
## 6  1466    184604
## 7  1467    194618
## 8  1468    173252
## 9  1469    216060
## 10 1470    116969

Write into a csv to submit to Kaggle

write.csv(prediction, "salepricepredictions.csv", row.names = FALSE) #row.names = FALSE keeps only the Id and SalePrice columns expected by Kaggle

Results from Kaggle

My Kaggle user name:

#

YouTube link: https://youtu.be/nZtMCDD_918