library(dplyr)
library(knitr)
library(moments)
library(kableExtra)
library(reshape2)
library(ggplot2)
library(tidyr)
library(corrplot)
library(MASS)
library(pastecs)
library(psych)
library(e1071)
library(fBasics)
library(mosaic)
library(ggthemes)
library(plogr)
library(Matrix)
library(onehot)
library(data.table)
library(mltools)
library(car)
Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with mean and standard deviation \(\mu = \sigma = (N+1)/2\).
#let N be 10
N <- 10
set.seed(1)
X <- runif(10000, 1, N)
set.seed(1)
Y <- rnorm(10000, (N+1)/2, (N+1)/2)
xmean <- mean(X)
ymean <- mean(Y)
xmedian <- median(X)
ymedian <- median(Y)
print(paste0("Mean of X is ", round(xmean,2)))
## [1] "Mean of X is 5.5"
print(paste0("Mean of Y is ", round(ymean,2)))
## [1] "Mean of Y is 5.46"
print(paste0("Median of X is ",round(xmedian,2) ))
## [1] "Median of X is 5.46"
print(paste0("Median of Y is ", round( ymedian,2)))
## [1] "Median of Y is 5.41"
#plots of the two distributions
hist(X, col='#80cbc4', xlab="X", main="Uniform Distribution of X")
hist(Y, col='green', xlab="Y", main="Normal Distribution of Y")
Probability. Calculate, at a minimum, probabilities a through c below. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.
#creating dataframe for the uniform distribution of X only
df <- as.data.frame(X)
head(df)
## X
## 1 3.389578
## 2 4.349115
## 3 6.155680
## 4 9.173870
## 5 2.815137
## 6 9.085507
#To determine 1st quartile of Y
summary(Y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -14.692 1.796 5.412 5.464 9.227 26.457
y <- quantile(Y, 0.25)
y
## 25%
## 1.796331
#To determine the median (2nd quartile) of X
summary(X)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.001 3.273 5.462 5.502 7.812 9.999
x <- quantile(X, 0.5)
x
## 50%
## 5.461671
options(digits=4)
#X is uniformly distributed (a rectangular distribution), so every value is equally likely;
#we want P(X > x | X > y) = P(X > x and X > y) / P(X > y)
p1<-nrow(subset(df, X > x & X>y))/nrow(subset(df,X>y))
p1
## [1] 0.5499
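As a sanity check (a sketch using the theoretical uniform CDF via base R's punif), the same conditional probability can be computed analytically; since x > y here, P(X>x and X>y) reduces to P(X>x):
#theoretical check: P(X>x | X>y) = P(X>x) / P(X>y) for X ~ Uniform(1, N)
p1_theory <- (1 - punif(x, min = 1, max = N)) / (1 - punif(y, min = 1, max = N))
p1_theory #approximately 0.55, in line with the empirical 0.5499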
options(digits=4)
#For the uniform distribution
Px <- nrow(subset(df, X > x))/nrow(df)
#But since Y is normally distributed, let's find the z-score first and use pnorm to obtain P(Y>y)
z <- ((y-(N+1)/2))/((N+1)/2)
Py <-1-pnorm(z)
# Assuming independence, apply P(A and B) = P(A) * P(B)
Px*Py
## 25%
## 0.3748
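As a hedged aside, pnorm can standardize internally, so the manual z-score is optional:
#equivalent computation letting pnorm handle the standardization
Py_alt <- 1 - pnorm(y, mean = (N + 1) / 2, sd = (N + 1) / 2)
Px * Py_alt #should reproduce the 0.3748 above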
Apply the same principle of conditional probability as in part (a).
options(digits=4)
p2<-nrow(subset(df, X < x & X>y))/nrow(subset(df,X>y))
p2
## [1] 0.4501
Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.
n<-nrow(df)
c1<-nrow(subset(df, X < x & Y<y))/n
c2<-nrow(subset(df, X < x & Y>y))/n
c3<-c1+c2
c4<-nrow(subset(df, X > x & Y<y))/n
c5<-nrow(subset(df, X > x & Y>y))/n
c6<-c4+c5
c7<-c1+c4
c8<-c2+c5
c9<-c3+c6
dfcounts<-matrix(round(c(c1,c2,c3,c4,c5,c6,c7,c8,c9),3), ncol=3, nrow=3, byrow=T)
colnames(dfcounts)<-c(
"Y<y",
"Y>y",
"Total")
rownames(dfcounts)<-c("X<x","X>x","Total")
print("Contingency Table by Percentage")
## [1] "Contingency Table by Percentage"
dfcounts<-as.table(dfcounts)
dfcounts
## Y<y Y>y Total
## X<x 0.126 0.374 0.500
## X>x 0.124 0.376 0.500
## Total 0.250 0.750 1.000
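The same table can be cross-checked more compactly (a sketch using base R's table, prop.table and addmargins on the logical events directly):
#rows: X > x (FALSE/TRUE); columns: Y > y (FALSE/TRUE); the Sum row/column gives the marginals
addmargins(prop.table(table(X > x, Y > y)))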
options(digits=4)
#from the percentage contingency table above
P_X_greater_x <- 0.5 #marginal across of the table (P(X>x))
P_Y_greater_y<- 0.75 #marginal vertical of the table (P(Y>y))
P_X_greater_x*P_Y_greater_y
## [1] 0.375
P(X>x and Y>y), read from the joint cell of the contingency table => 0.376
P(X>x)P(Y>y), obtained by multiplying the marginal probabilities of the contingency table together, see above => 0.375
Ans: the two values are approximately equal, consistent with independence
Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?
H0: X>x and Y>y are independent events
HA: X>x and Y>y are dependent events
#Converting the percentage contingency table to counts to perform Fisher and Chi Square tests of independence
print("Contingency Table by Counts")
## [1] "Contingency Table by Counts"
dfvals<-round(dfcounts*10000,0)
#subsetting contingency table for only the relevant counts
dfcountssubset <- dfvals[c(1:2),c(1:2)]
fisher.test(dfcountssubset )#Fisher Test
##
## Fisher's Exact Test for Count Data
##
## data: dfcountssubset
## p-value = 0.7
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9321 1.1195
## sample estimates:
## odds ratio
## 1.022
chisq.test(dfcountssubset)#Chi square test
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: dfcountssubset
## X-squared = 0.19, df = 1, p-value = 0.7
Conclusion:
Both p-values are well above 0.05, so we cannot reject H0 (X>x and Y>y are independent events). Both tests lead to the same conclusion: the events are independent.
The difference: Fisher's Exact Test computes an exact p-value and is typically reserved for small samples, especially tables with expected cell counts below 5, while the Chi-Square Test relies on a large-sample approximation. With 10,000 observations and large counts in every cell, the Chi-Square Test is the more appropriate choice here, although both tests agree.
url1<- "https://raw.githubusercontent.com/ssufian/DAT-605/master/train.csv"
train_df <- read.csv(url1, header = TRUE, sep = ",",stringsAsFactors = FALSE)
#replace NA with zeros across all columns (the columns analyzed below are numeric)
dfnum<-train_df %>% mutate_all(~replace(., is.na(.), 0))
#checking dimensions of dataframe
nrow(dfnum)
## [1] 1460
ncol(dfnum)
## [1] 81
#Taking only columns with numerical data
dfnum1<- dfnum[c(2,4,5, 18:21, 27,35, 37:39, 44:53, 55, 57, 60, 62, 63, 67, 72, 76:78, 81)]
df_unistats<- basicStats(dfnum1)[c("Minimum", "Maximum", "1. Quartile", "3. Quartile", "Mean", "Median",
"Variance", "Stdev", "Skewness", "Kurtosis"), ] %>% t() %>% as.data.frame()
kable(df_unistats)
Minimum | Maximum | 1. Quartile | 3. Quartile | Mean | Median | Variance | Stdev | Skewness | Kurtosis | |
---|---|---|---|---|---|---|---|---|---|---|
MSSubClass | 20 | 190 | 20.0 | 70.0 | 5.690e+01 | 50.0 | 1.789e+03 | 4.230e+01 | 1.4048 | 1.5644 |
LotFrontage | 0 | 313 | 42.0 | 79.0 | 5.762e+01 | 63.0 | 1.202e+03 | 3.466e+01 | 0.2673 | 3.5852 |
LotArea | 1300 | 215245 | 7553.5 | 11601.5 | 1.052e+04 | 9478.5 | 9.963e+07 | 9.981e+03 | 12.1826 | 202.2623 |
OverallQual | 1 | 10 | 5.0 | 7.0 | 6.099e+00 | 6.0 | 1.913e+00 | 1.383e+00 | 0.2165 | 0.0876 |
OverallCond | 1 | 9 | 5.0 | 6.0 | 5.575e+00 | 5.0 | 1.238e+00 | 1.113e+00 | 0.6916 | 1.0929 |
YearBuilt | 1872 | 2010 | 1954.0 | 2000.0 | 1.971e+03 | 1973.0 | 9.122e+02 | 3.020e+01 | -0.6122 | -0.4457 |
YearRemodAdd | 1950 | 2010 | 1967.0 | 2004.0 | 1.985e+03 | 1994.0 | 4.262e+02 | 2.065e+01 | -0.5025 | -1.2744 |
MasVnrArea | 0 | 1600 | 0.0 | 164.2 | 1.031e+02 | 0.0 | 3.266e+04 | 1.807e+02 | 2.6721 | 10.0847 |
BsmtFinSF1 | 0 | 5644 | 0.0 | 712.2 | 4.436e+02 | 383.5 | 2.080e+05 | 4.561e+02 | 1.6820 | 11.0568 |
BsmtFinSF2 | 0 | 1474 | 0.0 | 0.0 | 4.655e+01 | 0.0 | 2.602e+04 | 1.613e+02 | 4.2465 | 20.0089 |
BsmtUnfSF | 0 | 2336 | 223.0 | 808.0 | 5.672e+02 | 477.5 | 1.952e+05 | 4.419e+02 | 0.9184 | 0.4645 |
TotalBsmtSF | 0 | 6110 | 795.8 | 1298.2 | 1.057e+03 | 991.5 | 1.925e+05 | 4.387e+02 | 1.5211 | 13.1789 |
X1stFlrSF | 334 | 4692 | 882.0 | 1391.2 | 1.163e+03 | 1087.0 | 1.495e+05 | 3.866e+02 | 1.3739 | 5.7101 |
X2ndFlrSF | 0 | 2065 | 0.0 | 728.0 | 3.470e+02 | 0.0 | 1.906e+05 | 4.365e+02 | 0.8114 | -0.5590 |
LowQualFinSF | 0 | 572 | 0.0 | 0.0 | 5.845e+00 | 0.0 | 2.364e+03 | 4.862e+01 | 8.9928 | 82.8282 |
GrLivArea | 334 | 5642 | 1129.5 | 1776.8 | 1.515e+03 | 1464.0 | 2.761e+05 | 5.255e+02 | 1.3638 | 4.8635 |
BsmtFullBath | 0 | 3 | 0.0 | 1.0 | 4.253e-01 | 0.0 | 2.693e-01 | 5.189e-01 | 0.5948 | -0.8433 |
BsmtHalfBath | 0 | 2 | 0.0 | 0.0 | 5.750e-02 | 0.0 | 5.700e-02 | 2.388e-01 | 4.0950 | 16.3100 |
FullBath | 0 | 3 | 1.0 | 2.0 | 1.565e+00 | 2.0 | 3.035e-01 | 5.509e-01 | 0.0365 | -0.8612 |
HalfBath | 0 | 2 | 0.0 | 1.0 | 3.829e-01 | 0.0 | 2.529e-01 | 5.029e-01 | 0.6745 | -1.0800 |
BedroomAbvGr | 0 | 8 | 2.0 | 3.0 | 2.866e+00 | 3.0 | 6.655e-01 | 8.158e-01 | 0.2114 | 2.2120 |
KitchenAbvGr | 0 | 3 | 1.0 | 1.0 | 1.047e+00 | 1.0 | 4.850e-02 | 2.203e-01 | 4.4792 | 21.4211 |
TotRmsAbvGrd | 2 | 14 | 5.0 | 7.0 | 6.518e+00 | 6.0 | 2.642e+00 | 1.625e+00 | 0.6750 | 0.8683 |
Fireplaces | 0 | 3 | 0.0 | 1.0 | 6.130e-01 | 1.0 | 4.156e-01 | 6.447e-01 | 0.6482 | -0.2244 |
GarageYrBlt | 0 | 2010 | 1958.0 | 2001.0 | 1.869e+03 | 1977.0 | 2.058e+05 | 4.537e+02 | -3.8616 | 12.9726 |
GarageCars | 0 | 4 | 1.0 | 2.0 | 1.767e+00 | 2.0 | 5.585e-01 | 7.473e-01 | -0.3418 | 0.2117 |
GarageArea | 0 | 1418 | 334.5 | 576.0 | 4.730e+02 | 480.0 | 4.571e+04 | 2.138e+02 | 0.1796 | 0.9045 |
WoodDeckSF | 0 | 857 | 0.0 | 168.0 | 9.424e+01 | 0.0 | 1.571e+04 | 1.253e+02 | 1.5382 | 2.9704 |
PoolArea | 0 | 738 | 0.0 | 0.0 | 2.759e+00 | 0.0 | 1.614e+03 | 4.018e+01 | 14.7979 | 222.1917 |
MiscVal | 0 | 15500 | 0.0 | 0.0 | 4.349e+01 | 0.0 | 2.461e+05 | 4.961e+02 | 24.4265 | 697.6401 |
MoSold | 1 | 12 | 5.0 | 8.0 | 6.322e+00 | 6.0 | 7.310e+00 | 2.704e+00 | 0.2116 | -0.4104 |
YrSold | 2006 | 2010 | 2007.0 | 2009.0 | 2.008e+03 | 2008.0 | 1.764e+00 | 1.328e+00 | 0.0961 | -1.1931 |
SalePrice | 34900 | 755000 | 129975.0 | 214000.0 | 1.809e+05 | 163000.0 | 6.311e+09 | 7.944e+04 | 1.8790 | 6.4968 |
# Box plots
meltData <- melt(dfnum1)
p <- ggplot(meltData, aes(factor(variable), value))
p + geom_boxplot() + facet_wrap(~variable, scale="free")+theme_economist()+scale_colour_economist()
#density plots
density <- dfnum1 %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_density()+theme_economist()+scale_colour_economist()
density
cormatrix<-cor(dfnum1)
cordf<-as.data.frame(cormatrix)
kable(head(cordf))
MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | X1stFlrSF | X2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | TotRmsAbvGrd | Fireplaces | GarageYrBlt | GarageCars | GarageArea | WoodDeckSF | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MSSubClass | 1.0000 | -0.2150 | -0.1398 | 0.0326 | -0.0593 | 0.0279 | 0.0406 | 0.0236 | -0.0698 | -0.0656 | -0.1408 | -0.2385 | -0.2518 | 0.3079 | 0.0465 | 0.0749 | 0.0035 | -0.0023 | 0.1316 | 0.1774 | -0.0234 | 0.2817 | 0.0404 | -0.0456 | -0.0810 | -0.0401 | -0.0987 | -0.0126 | 0.0083 | -0.0077 | -0.0136 | -0.0214 | -0.0843 |
LotFrontage | -0.2150 | 1.0000 | 0.1007 | 0.1766 | -0.0535 | 0.0369 | 0.0787 | 0.1050 | 0.0767 | -0.0093 | 0.1608 | 0.2383 | 0.2452 | 0.0425 | 0.0500 | 0.2203 | 0.0105 | -0.0279 | 0.1205 | -0.0130 | 0.1445 | 0.0344 | 0.2214 | 0.0440 | 0.0193 | 0.1652 | 0.2015 | -0.0168 | 0.1141 | -0.0596 | 0.0189 | -0.0121 | 0.2096 |
LotArea | -0.1398 | 0.1007 | 1.0000 | 0.1058 | -0.0056 | 0.0142 | 0.0138 | 0.1033 | 0.2141 | 0.1112 | -0.0026 | 0.2608 | 0.2995 | 0.0510 | 0.0048 | 0.2631 | 0.1582 | 0.0480 | 0.1260 | 0.0143 | 0.1197 | -0.0178 | 0.1900 | 0.2714 | 0.0726 | 0.1549 | 0.1804 | 0.1717 | 0.0777 | 0.0381 | 0.0012 | -0.0143 | 0.2638 |
OverallQual | 0.0326 | 0.1766 | 0.1058 | 1.0000 | -0.0919 | 0.5723 | 0.5507 | 0.4073 | 0.2397 | -0.0591 | 0.3082 | 0.5378 | 0.4762 | 0.2955 | -0.0304 | 0.5930 | 0.1111 | -0.0402 | 0.5506 | 0.2735 | 0.1017 | -0.1839 | 0.4275 | 0.3968 | 0.2890 | 0.6007 | 0.5620 | 0.2389 | 0.0652 | -0.0314 | 0.0708 | -0.0273 | 0.7910 |
OverallCond | -0.0593 | -0.0535 | -0.0056 | -0.0919 | 1.0000 | -0.3760 | 0.0737 | -0.1257 | -0.0462 | 0.0402 | -0.1368 | -0.1711 | -0.1442 | 0.0289 | 0.0255 | -0.0797 | -0.0549 | 0.1178 | -0.1941 | -0.0608 | 0.0130 | -0.0870 | -0.0576 | -0.0238 | -0.0065 | -0.1858 | -0.1515 | -0.0033 | -0.0020 | 0.0688 | -0.0035 | 0.0439 | -0.0779 |
YearBuilt | 0.0279 | 0.0369 | 0.0142 | 0.5723 | -0.3760 | 1.0000 | 0.5929 | 0.3116 | 0.2495 | -0.0491 | 0.1490 | 0.3915 | 0.2820 | 0.0103 | -0.1838 | 0.1990 | 0.1876 | -0.0382 | 0.4683 | 0.2427 | -0.0707 | -0.1748 | 0.0956 | 0.1477 | 0.2720 | 0.5379 | 0.4790 | 0.2249 | 0.0049 | -0.0344 | 0.0124 | -0.0136 | 0.5229 |
cordf[cordf == 1] <- NA #drop correlation of 1, diagonals
cordf[abs(cordf) < 0.5] <- NA # drop correlations of less than 0.5
cordf<-as.matrix(cordf)
cordf2<- na.omit(melt(cordf))
kable(head(cordf2[order(-abs(cordf2$value)),])) # sort by highest correlations
Var1 | Var2 | value | |
---|---|---|---|
852 | GarageArea | GarageCars | 0.8825 |
884 | GarageCars | GarageArea | 0.8825 |
518 | TotRmsAbvGrd | GrLivArea | 0.8255 |
742 | GrLivArea | TotRmsAbvGrd | 0.8255 |
376 | X1stFlrSF | TotalBsmtSF | 0.8195 |
408 | TotalBsmtSF | X1stFlrSF | 0.8195 |
Test the hypotheses that the correlations between each pairwise set of variables (see below) are 0 and provide an 80% confidence interval.
headers = c("GarageCars","GrLivArea","GarageArea","TotRmsAbvGrd","TotalBsmtSF","X1stFlrSF")
newdata <- dfnum[headers]
correlation = cor(newdata )
corrplot(correlation, method="pie")
#hypotheses testing
#X1stFlrSF with TotalBsmtSF
cor.test(dfnum1$X1stFlrSF, dfnum1$TotalBsmtSF, method = "pearson", conf.level = 0.8) #cor.test takes a single method; passing a vector silently falls back to the first
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 55, df = 1458, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.8082 0.8303
## sample estimates:
## cor
## 0.8195
#TotRmsAbvGrd with GrLivArea
cor.test(dfnum1$TotRmsAbvGrd, dfnum1$GrLivArea, method = "pearson", conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 56, df = 1458, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.8145 0.8359
## sample estimates:
## cor
## 0.8255
#GarageArea with GarageCars
cor.test(dfnum1$GarageArea, dfnum1$GarageCars, method = "pearson", conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 72, df = 1458, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.8748 0.8897
## sample estimates:
## cor
## 0.8825
Ans:
-In all three instances the p-values are very small; the p-value of every test is below the significance level of 0.05. We can reject the null hypothesis and conclude that the true correlation is not 0 for the selected variables, i.e., they are significantly correlated.
With such low p-values, and the correlation matrix already showing how strongly correlated they are, I would not worry about familywise error in this case, but one may still be curious about it.
According to https://www.statisticshowto.com/familywise-error-rate/:
The familywise error rate (FWE or FWER) is the probability of coming to at least one false conclusion in a series of hypothesis tests. In other words, it is the probability of making at least one Type I Error. The term “familywise” comes from “family of tests”, the technical term for a series of tests on data.
Formula: The formula to estimate the familywise error rate is: FWER ≤ 1 – (1 – α)^c
Where:
α = alpha level for an individual test (e.g. .05), c = Number of comparisons.
From the equation above, FWER is a function of the number of comparisons c and of alpha. If an overall Type I error rate of about 0.05 is desired, then roughly 5 comparison tests at a reduced individual alpha of 0.01 would keep the FWER near that level, since 1 - (1 - 0.01)^5 ≈ 0.049; see the graph below.
#plotting FWER vs c
alpha = 0.01
eq = function(c){1-(1-alpha)^c}
ggplot(data.frame(x=c(1, 10)), aes(x=x)) +
stat_function(fun=eq)+theme_economist()+scale_colour_economist()+ xlab("c, number of comparison tests") +
ylab("Familywise error rate") +
ggtitle("Familywise error rate (FWER) as a function of the number of comparisons")+ geom_vline(xintercept = 5, linetype="dotted",
color = "red", size=1.5)+ geom_hline(yintercept=0.05, linetype="dotted", color = "red", size=1.5)
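If one did want to control the familywise error for the three correlation tests above, a Bonferroni adjustment is the simplest option (a sketch; the reported p-values were all below 2e-16, used here as an upper bound):
#Bonferroni adjustment: each p-value is multiplied by the number of tests
pvals <- c(2e-16, 2e-16, 2e-16)
p.adjust(pvals, method = "bonferroni") #still far below 0.05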
cordf2<-as.data.frame(cordf2)
topcors <- cordf2[ which(cordf2$Var2=='SalePrice'),]
topcorsdf<-topcors[order(-abs(topcors$value)),]# sort by highest correlations
cors1<-data.frame(topcorsdf$Var1,topcorsdf$Var2,topcorsdf$value)
cors2<-rename(cors1,Correlation_to_Sale=topcorsdf.value,Independent_variables=topcorsdf.Var1)
kable(cors2)
Independent_variables | topcorsdf.Var2 | Correlation_to_Sale |
---|---|---|
OverallQual | SalePrice | 0.7910 |
GrLivArea | SalePrice | 0.7086 |
GarageCars | SalePrice | 0.6404 |
GarageArea | SalePrice | 0.6234 |
TotalBsmtSF | SalePrice | 0.6136 |
X1stFlrSF | SalePrice | 0.6059 |
FullBath | SalePrice | 0.5607 |
TotRmsAbvGrd | SalePrice | 0.5337 |
YearBuilt | SalePrice | 0.5229 |
YearRemodAdd | SalePrice | 0.5071 |
p<-ggplot(data=cors2, aes(x=Independent_variables, y=Correlation_to_Sale, fill=Independent_variables)) +
geom_bar(stat="identity")+ggtitle("Fig1: Top Ten Correlations of Sale Price vs. Independent Variables")
# Horizontal bar plot
p + coord_flip()+theme_economist()+scale_colour_economist()
ggplot(dfnum1, aes(x=OverallQual, y=SalePrice)) + geom_point(shape=1) +
geom_smooth(method=lm, # Add a linear regression line
se=T, # Add the shaded confidence region
fullrange=TRUE)+theme_economist()+scale_colour_economist() +ggtitle("Fig2a: Scatter plot of OverallQual vs SalePrice")
ggplot(dfnum1, aes(x=GrLivArea, y=SalePrice)) + geom_point(shape=1) +
geom_smooth(method=lm, # Add a linear regression line
se=T, # Add the shaded confidence region
fullrange=TRUE)+theme_economist()+scale_colour_economist() +ggtitle("Fig2b: Scatter plot of GrLivArea vs SalePrice")
#inverse of correlation matrix
correlation_inverted = solve(correlation)
correlation_inverted
## GarageCars GrLivArea GarageArea TotRmsAbvGrd TotalBsmtSF X1stFlrSF
## GarageCars 4.6265 -0.1360 -3.9723 -0.26552 -0.0630 0.15041
## GrLivArea -0.1360 4.1111 -0.3814 -2.85138 -0.1918 -0.75554
## GarageArea -3.9723 -0.3814 4.8783 0.31841 -0.3269 -0.29078
## TotRmsAbvGrd -0.2655 -2.8514 0.3184 3.26498 0.3969 -0.08771
## TotalBsmtSF -0.0630 -0.1918 -0.3269 0.39692 3.1998 -2.48847
## X1stFlrSF 0.1504 -0.7555 -0.2908 -0.08771 -2.4885 3.57928
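A useful property of the precision matrix (a hedged aside that previews the VIF analysis later in this report): for a correlation matrix, each diagonal entry of the inverse equals 1/(1 - R_j^2), the variance inflation factor of that variable when regressed on the remaining ones.
#diagonal of the precision matrix = VIF of each variable against the others
diag(correlation_inverted)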
#multiply the precision matrix by the correlation matrix
correlation_inverted %*% correlation
## GarageCars GrLivArea GarageArea TotRmsAbvGrd TotalBsmtSF
## GarageCars 1.000e+00 -6.939e-17 1.665e-16 -4.163e-17 -2.776e-17
## GrLivArea -3.331e-16 1.000e+00 -2.220e-16 3.886e-16 -1.110e-16
## GarageArea 2.498e-16 5.551e-17 1.000e+00 1.110e-16 2.776e-17
## TotRmsAbvGrd -2.012e-16 -2.637e-16 8.327e-17 1.000e+00 -2.776e-17
## TotalBsmtSF 4.441e-16 2.220e-16 4.441e-16 0.000e+00 1.000e+00
## X1stFlrSF 0.000e+00 0.000e+00 0.000e+00 0.000e+00 -4.441e-16
## X1stFlrSF
## GarageCars 1.110e-16
## GrLivArea 2.220e-16
## GarageArea -5.551e-17
## TotRmsAbvGrd -3.192e-16
## TotalBsmtSF 8.882e-16
## X1stFlrSF 1.000e+00
#multiply the correlation matrix by the precision matrix
correlation %*% correlation_inverted
## GarageCars GrLivArea GarageArea TotRmsAbvGrd TotalBsmtSF X1stFlrSF
## GarageCars 1.000e+00 0.000e+00 3.053e-16 -1.388e-17 0.000e+00 0
## GrLivArea -3.192e-16 1.000e+00 1.110e-16 1.457e-16 0.000e+00 0
## GarageArea -2.914e-16 5.551e-17 1.000e+00 -1.735e-16 0.000e+00 0
## TotRmsAbvGrd -2.429e-16 5.551e-17 1.943e-16 1.000e+00 0.000e+00 0
## TotalBsmtSF -3.053e-16 0.000e+00 3.608e-16 1.249e-16 1.000e+00 0
## X1stFlrSF -3.886e-16 2.220e-16 2.776e-16 5.551e-17 4.441e-16 1
Observation: in both orders, the product of the correlation matrix and its inverse is the identity matrix, up to floating-point rounding (the off-diagonal entries are on the order of 1e-16).
A <- correlation # assign the correlation matrix to A
luDec <- lu(A)
L <- expand(luDec)$L
L #lower triangle matrix
## 6 x 6 Matrix of class "dtrMatrix" (unitriangular)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.00000 . . . . .
## [2,] 0.46725 1.00000 . . . .
## [3,] 0.88248 0.07249 1.00000 . . .
## [4,] 0.36229 0.83949 -0.13566 1.00000 . .
## [5,] 0.43458 0.32214 0.39102 -0.22858 1.00000 .
## [6,] 0.43932 0.46151 0.34977 -0.13442 0.69524 1.00000
U <- expand(luDec)$U
U #upper triangle matrix
## 6 x 6 Matrix of class "dtrMatrix"
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.00000 0.46725 0.88248 0.36229 0.43458 0.43932
## [2,] . 0.78168 0.05666 0.65621 0.25181 0.36075
## [3,] . . 0.21713 -0.02946 0.08490 0.07594
## [4,] . . . 0.31387 -0.07175 -0.04219
## [5,] . . . . 0.68042 0.47306
## [6,] . . . . . 0.27939
P <- expand(luDec)$P
P #permutation matrix (stored as a sparse matrix; here it is the identity, so no row pivoting occurred)
## 6 x 6 sparse Matrix of class "pMatrix"
##
## [1,] | . . . . .
## [2,] . | . . . .
## [3,] . . | . . .
## [4,] . . . | . .
## [5,] . . . . | .
## [6,] . . . . . |
L %*% U
## 6 x 6 Matrix of class "dgeMatrix"
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.0000 0.4672 0.8825 0.3623 0.4346 0.4393
## [2,] 0.4672 1.0000 0.4690 0.8255 0.4549 0.5660
## [3,] 0.8825 0.4690 1.0000 0.3378 0.4867 0.4898
## [4,] 0.3623 0.8255 0.3378 1.0000 0.2856 0.4095
## [5,] 0.4346 0.4549 0.4867 0.2856 1.0000 0.8195
## [6,] 0.4393 0.5660 0.4898 0.4095 0.8195 1.0000
P %*% L %*% U # the LU decomposition
## 6 x 6 Matrix of class "dgeMatrix"
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.0000 0.4672 0.8825 0.3623 0.4346 0.4393
## [2,] 0.4672 1.0000 0.4690 0.8255 0.4549 0.5660
## [3,] 0.8825 0.4690 1.0000 0.3378 0.4867 0.4898
## [4,] 0.3623 0.8255 0.3378 1.0000 0.2856 0.4095
## [5,] 0.4346 0.4549 0.4867 0.2856 1.0000 0.8195
## [6,] 0.4393 0.5660 0.4898 0.4095 0.8195 1.0000
correlation #the original correlation matrix, which was assigned to A
## GarageCars GrLivArea GarageArea TotRmsAbvGrd TotalBsmtSF X1stFlrSF
## GarageCars 1.0000 0.4672 0.8825 0.3623 0.4346 0.4393
## GrLivArea 0.4672 1.0000 0.4690 0.8255 0.4549 0.5660
## GarageArea 0.8825 0.4690 1.0000 0.3378 0.4867 0.4898
## TotRmsAbvGrd 0.3623 0.8255 0.3378 1.0000 0.2856 0.4095
## TotalBsmtSF 0.4346 0.4549 0.4867 0.2856 1.0000 0.8195
## X1stFlrSF 0.4393 0.5660 0.4898 0.4095 0.8195 1.0000
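A quick numerical confirmation of the factorization (a sketch; the largest absolute discrepancy should be at machine-precision level):
#verify A = P %*% L %*% U up to floating-point error
max(abs(A - as.matrix(P %*% L %*% U)))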
#plot a histogram of the SalePrice variable
hist(dfnum1$SalePrice , col = 'yellow', main = 'Fig3a: SalePrice - the dependent variable to investigate', breaks = 30)
summary(dfnum1$SalePrice ) # original distribution of saleprice
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
library(MASS)# load the MASS library per instruction
edist<-fitdistr(dfnum1$SalePrice, densfun = 'exponential')
lambda <- edist$estimate
exp_distribution <- rexp(1000, lambda)
summary(exp_distribution)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15 54796 125590 184969 252524 2025507
#Optimal lambda value
print("The fitted optimal lambda based on N = 1000 samples")
## [1] "The fitted optimal lambda based on N = 1000 samples"
lambda
## rate
## 5.527e-06
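As a cross-check (a sketch): the maximum-likelihood estimate of an exponential rate is the reciprocal of the sample mean, so fitdistr should agree with 1/mean:
#MLE of the exponential rate parameter
1 / mean(dfnum1$SalePrice) #~5.527e-06, matching the fitted lambda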
par(mfrow=c(1,2))
#original histogram without exponential fitting
hist(dfnum1$SalePrice , col = 'yellow', main = 'Fig3a: The Original SalePrice', breaks = 30)
#histogram with exponential fitting
hist(exp_distribution, freq = FALSE, breaks = 100, main ="Fig3b: Fitted Exponential PDF of SalePrice",col = "blue", xlim = c(1, quantile(exp_distribution, 0.99)))
curve(dexp(x, rate = lambda), col = "red", add = TRUE)
#Generating the 5th and 95th percentiles
print("The 95% confidence interval of exponential distribution with the optimal lambda")
## [1] "The 95% confidence interval of exponential distribution with the optimal lambda"
qexp(c(0.05,0.95), lambda)
## [1] 9280 541991
#Generating the matching 5th and 95th percentiles from the data, assuming the distribution is normal
print("The 5th and 95th percentiles (a central 90% interval) of the sale price distribution assuming normality")
## [1] "The 5th and 95th percentiles (a central 90% interval) of the sale price distribution assuming normality"
qnorm(c(0.05, 0.95), mean = mean(dfnum1$SalePrice), sd = sd(dfnum1$SalePrice))
## [1] 50250 311592
#the empirical 5th and 95th percentiles of the data
print("The empirical 5th and 95th percentiles of the actual sale price data")
## [1] "The empirical 5th and 95th percentiles of the actual sale price data"
quantile(dfnum1$SalePrice, c(0.05, 0.95))
## 5% 95%
## 88000 326100
Let’s break this down one item at a time:
The original SalePrice distribution has a mean of 180921; since the exponential MLE of the rate is the reciprocal of the sample mean, lambda(original sample) = 1/180921 ≈ 5.527e-06
The mean of the 1,000 draws from the fitted exponential distribution was 184969 => lambda(exponential) ≈ 5.41e-06
Given the two lambdas are approximately the same, 5.527e-06 is a good estimate of the optimal lambda
Based on these samples, the best point estimate of sale price is the sample mean (in this case 180921). That serves as a good point estimate, but it is also useful to communicate the spread of the distribution around it.
The three intervals above capture that spread: each runs from the 5th to the 95th percentile, spanning the central 90% of the distribution, computed from the fitted exponential, from a normal approximation, and from the empirical quantiles. Note that these are percentile intervals describing the spread of sale prices, not confidence intervals for the mean.
Even though we don't know the full population, the three intervals give a consistent picture of where the bulk of sale prices falls; the exponential interval is by far the widest because the sale price data is not truly exponential.
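For completeness, the uncertainty of the mean itself can be quantified separately (a sketch of a normal-approximation 95% confidence interval for the mean; note how much narrower it is than the percentile intervals above, because it shrinks with sqrt(n)):
#95% confidence interval for the mean sale price (normal approximation)
m <- mean(dfnum1$SalePrice)
se <- sd(dfnum1$SalePrice) / sqrt(nrow(dfnum1))
m + c(-1.96, 1.96) * se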
#these predictors were found to have high correlation to sales price, see Fig1
headers1 = c("GarageArea","GrLivArea","FullBath","OverallQual","TotalBsmtSF","YearBuilt","YearRemodAdd")
# This is to ensure we have not inadvertently picked predictors with high VIF
newdata1 <- dfnum[headers1]
correlation1 = cor(newdata1 )
print("Correlational relationships of Predictors with each other")
## [1] "Correlational relationships of Predictors with each other"
correlation1
## GarageArea GrLivArea FullBath OverallQual TotalBsmtSF YearBuilt
## GarageArea 1.0000 0.4690 0.4057 0.5620 0.4867 0.4790
## GrLivArea 0.4690 1.0000 0.6300 0.5930 0.4549 0.1990
## FullBath 0.4057 0.6300 1.0000 0.5506 0.3237 0.4683
## OverallQual 0.5620 0.5930 0.5506 1.0000 0.5378 0.5723
## TotalBsmtSF 0.4867 0.4549 0.3237 0.5378 1.0000 0.3915
## YearBuilt 0.4790 0.1990 0.4683 0.5723 0.3915 1.0000
## YearRemodAdd 0.3716 0.2874 0.4390 0.5507 0.2911 0.5929
## YearRemodAdd
## GarageArea 0.3716
## GrLivArea 0.2874
## FullBath 0.4390
## OverallQual 0.5507
## TotalBsmtSF 0.2911
## YearBuilt 0.5929
## YearRemodAdd 1.0000
#fit the regression model
model <- lm(SalePrice ~ GarageArea+ GrLivArea + FullBath + OverallQual+TotalBsmtSF+YearBuilt+YearRemodAdd ,data = dfnum1)
#view the output of the regression model
summary(model)
##
## Call:
## lm(formula = SalePrice ~ GarageArea + GrLivArea + FullBath +
## OverallQual + TotalBsmtSF + YearBuilt + YearRemodAdd, data = dfnum1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -519713 -19055 -2564 14943 289099
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.21e+06 1.28e+05 -9.50 < 2e-16 ***
## GarageArea 4.56e+01 6.14e+00 7.41 2.1e-13 ***
## GrLivArea 5.40e+01 3.01e+00 17.96 < 2e-16 ***
## FullBath -5.45e+03 2.66e+03 -2.05 0.04 *
## OverallQual 1.98e+04 1.18e+03 16.77 < 2e-16 ***
## TotalBsmtSF 2.79e+01 2.88e+00 9.68 < 2e-16 ***
## YearBuilt 2.84e+02 4.98e+01 5.70 1.5e-08 ***
## YearRemodAdd 2.98e+02 6.40e+01 4.65 3.6e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38100 on 1452 degrees of freedom
## Multiple R-squared: 0.771, Adjusted R-squared: 0.769
## F-statistic: 696 on 7 and 1452 DF, p-value: <2e-16
#calculate the VIF for each predictor variable in the model
VIF <- function(linear.model, no.intercept=FALSE, all.diagnostics=FALSE, plot=FALSE) {
  require(mctest)
  #build the design matrix, dropping the intercept column unless told otherwise
  if(no.intercept==FALSE) design.matrix <- model.matrix(linear.model)[,-1]
  if(no.intercept==TRUE) design.matrix <- model.matrix(linear.model)
  if(plot==TRUE) mc.plot(design.matrix, linear.model$model[1])
  #imcdiag() from mctest returns individual collinearity diagnostics; keep only the VIF column by default
  if(all.diagnostics==FALSE) output <- imcdiag(design.matrix, linear.model$model[1], method='VIF')$idiags[,1]
  if(all.diagnostics==TRUE) output <- imcdiag(design.matrix, linear.model$model[1])
  output
}
VIF(model)
## Loading required package: mctest
## GarageArea GrLivArea FullBath OverallQual TotalBsmtSF YearBuilt
## 1.730 2.503 2.148 2.662 1.599 2.268
## YearRemodAdd
## 1.750
#create vector of VIF values
vif_values <- VIF(model)
#create horizontal bar chart to display each VIF value
barplot(vif_values, main = "Fig4: VIF Values on Model 1", horiz = TRUE, col = "gold",las=2,cex.names=.53,xlim=c(0,5))
#add vertical line at 5
abline(v = 5, lwd = 3, lty = 2)
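Since the car package is already loaded, its vif() function offers a quicker cross-check of the custom helper (a sketch; it should reproduce essentially the same values):
#cross-check the custom VIF() helper against car::vif()
car::vif(model)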
- The above VIF analysis and the correlation table show that the 7 chosen predictors do not have significant multicollinearity (all VIFs < 5) and therefore should make good predictors of the dependent variable.
The purpose of the next set of tests is to check that the model is correctly specified.
# Compute the analysis of variance
res.aov <- aov(model, data = dfnum1)
# Summary of the analysis
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## GarageArea 1 3.58e+12 3.58e+12 2459.2 < 2e-16 ***
## GrLivArea 1 2.05e+12 2.05e+12 1405.3 < 2e-16 ***
## FullBath 1 8.38e+10 8.38e+10 57.6 5.8e-14 ***
## OverallQual 1 1.10e+12 1.10e+12 757.1 < 2e-16 ***
## TotalBsmtSF 1 1.67e+11 1.67e+11 114.7 < 2e-16 ***
## YearBuilt 1 8.70e+10 8.70e+10 59.8 2.0e-14 ***
## YearRemodAdd 1 3.15e+10 3.15e+10 21.6 3.6e-06 ***
## Residuals 1452 2.11e+12 1.46e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 1. Homogeneity of variances check
plot(res.aov, 1)
# 2. Normality of residual check
plot(res.aov, 2)
#histogram of residuals
ggplot(model, aes(x=.resid))+geom_histogram(bins=30)
Summary of statistical diagnostics & overall evaluation of Model 1:
Overall, Model 1 is not entirely correctly specified. It's good but not great.
The prescribed remedy is to apply transformations to some or all of the predictor variables to improve the model.
Let's start with the obvious first: taking logs of the predictors. Let's review them and see whether they are worth keeping.
#headers1 = c("GarageCars","GrLivArea","FullBath","OverallQual","TotalBsmtSF","YearBuilt","YearRemodAdd")
dfnum2<-fortify(dfnum1)
GrLivArealog= log(dfnum2$GrLivArea+min(dfnum2$GrLivArea))
TotalBsmtSFlog= log(dfnum2$TotalBsmtSF)
YearBuiltlog=log(dfnum2$YearBuilt)
YearRemodAddlog=log(dfnum2$YearRemodAdd)
Garagearealog=log(dfnum2$GarageArea)
#FullBathlog=log(dfnum2$FullBath)
par(mfrow=c(3,3))
hist(GrLivArealog , col = 'yellow', main = 'Fig5a: Log of GrLivArea', breaks = 30)
hist(TotalBsmtSFlog , col = 'green', main = 'Fig5b: Log of TotalBsmtSF', breaks = 30)
hist(YearBuiltlog, col = 'red', main = 'Fig5c: Log of YearBuilt', breaks = 30)
hist(YearRemodAddlog , col = 'orange', main = 'Fig5e: Log of YearRemodAdd', breaks = 30)
hist(Garagearealog , col = 'darkorchid2', main = 'Fig5f: Log of Garagearea', breaks = 30)
Taking the logs of YearRemodAdd, YearBuilt and FullBath did not look promising, so I decided to drop them as predictors in my next model.
#fit the regression model again but with transformed data
#dfnum3<-dfnum2 %>% mutate(TotalBsmtSFlog=log(dfnum2$TotalBsmtSF),Garagearealog=log(dfnum2$GarageArea),GrLivArealog=log(dfnum2$GrLivArea))
#model1 <- lm((SalePrice)~ TotalBsmtSFlog+Garagearealog+GrLivArealog+OverallQual,data = dfnum3,na.action=na.exclude)
#view the output of the regression model with transformed data
#summary(model1)
I was getting this error after taking the log of the predictors: Error in lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) : NA/NaN/Inf in 'x'. The cause is that several predictors contain zeros and log(0) is -Inf, so the model matrix contained non-finite values; adding 1 inside the log (below) avoids this.
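A quick way to confirm the cause (a sketch; per the summary statistics table above, TotalBsmtSF has a minimum of 0):
#zeros in a predictor produce -Inf under a plain log
sum(dfnum2$TotalBsmtSF == 0) #number of zero-valued rows
log(0) #-Inf, which lm() rejects as non-finite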
dfnum3<-dfnum2 %>% mutate(TotalBsmtSFlog=log(dfnum2$TotalBsmtSF+1),Garagearealog=log(dfnum2$GarageArea+1),GrLivArealog=log(dfnum2$GrLivArea+1))
#view the output of the regression model with transformed data
model1 <- lm((SalePrice)~ TotalBsmtSFlog+Garagearealog+GrLivArealog+OverallQual,data = dfnum3,na.action=na.exclude)
summary(model1)
##
## Call:
## lm(formula = (SalePrice) ~ TotalBsmtSFlog + Garagearealog + GrLivArealog +
## OverallQual, data = dfnum3, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -263785 -24419 -2463 19308 357747
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -628841 28513 -22.05 < 2e-16 ***
## TotalBsmtSFlog 4123 1058 3.90 0.00010 ***
## Garagearealog 3108 848 3.66 0.00026 ***
## GrLivArealog 78772 4339 18.15 < 2e-16 ***
## OverallQual 31368 1116 28.11 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 43400 on 1455 degrees of freedom
## Multiple R-squared: 0.702, Adjusted R-squared: 0.701
## F-statistic: 858 on 4 and 1455 DF, p-value: <2e-16
any(is.na(dfnum3$TotalBsmtSFlog)) #check whether any NA's remain (all() would only flag a column that is entirely NA)
## [1] FALSE
res.aov1 <- aov(model1, data = dfnum3)
# Summary of the analysis
summary(res.aov1)
## Df Sum Sq Mean Sq F value Pr(>F)
## TotalBsmtSFlog 1 9.78e+11 9.78e+11 519 <2e-16 ***
## Garagearealog 1 9.37e+11 9.37e+11 497 <2e-16 ***
## GrLivArealog 1 3.06e+12 3.06e+12 1625 <2e-16 ***
## OverallQual 1 1.49e+12 1.49e+12 790 <2e-16 ***
## Residuals 1455 2.74e+12 1.88e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(res.aov1, 2)
plot(res.aov1, 1)
#setting a "full model" with all the numeric predictors in it
full.model <- lm(SalePrice ~ MSSubClass + LotFrontage + LotArea + OverallQual
+ YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 +
BsmtFinSF2 + X1stFlrSF + X2ndFlrSF +
BsmtFullBath + BedroomAbvGr +
TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF +
PoolArea, data = dfnum3)
# Stepwise regression model - in both directions (k = 2 applies the standard AIC penalty)
step.model <- stepAIC(full.model, direction = "both",
trace = FALSE, k = 2)
summary(step.model)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual +
## YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + X1stFlrSF +
## X2ndFlrSF + BsmtFullBath + BedroomAbvGr + TotRmsAbvGrd +
## Fireplaces + GarageCars + WoodDeckSF, data = dfnum3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -484184 -16921 -1841 14129 284221
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.04e+06 1.15e+05 -9.00 < 2e-16 ***
## MSSubClass -1.99e+02 2.41e+01 -8.25 3.4e-16 ***
## LotArea 4.25e-01 1.01e-01 4.20 2.8e-05 ***
## OverallQual 1.97e+04 1.11e+03 17.76 < 2e-16 ***
## YearBuilt 2.02e+02 4.51e+01 4.48 8.2e-06 ***
## YearRemodAdd 3.00e+02 6.02e+01 4.99 6.9e-07 ***
## MasVnrArea 3.22e+01 5.93e+00 5.44 6.2e-08 ***
## BsmtFinSF1 1.28e+01 2.99e+00 4.30 1.8e-05 ***
## X1stFlrSF 5.16e+01 4.59e+00 11.24 < 2e-16 ***
## X2ndFlrSF 4.71e+01 4.09e+00 11.50 < 2e-16 ***
## BsmtFullBath 7.80e+03 2.40e+03 3.25 0.0012 **
## BedroomAbvGr -8.91e+03 1.67e+03 -5.33 1.1e-07 ***
## TotRmsAbvGrd 4.04e+03 1.18e+03 3.44 0.0006 ***
## Fireplaces 4.83e+03 1.72e+03 2.81 0.0050 **
## GarageCars 1.03e+04 1.71e+03 6.04 2.0e-09 ***
## WoodDeckSF 2.58e+01 7.88e+00 3.28 0.0011 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35200 on 1444 degrees of freedom
## Multiple R-squared: 0.806, Adjusted R-squared: 0.804
## F-statistic: 400 on 15 and 1444 DF, p-value: <2e-16
#Note1: the FullBath, BsmtUnfSF & LowQualFinSF predictors were removed as they did not contribute significantly to the model
#Note2: KitchenAbvGr & OverallCond had statistically insignificant F-values
res.aov3 <- aov(step.model, data = dfnum3)
# Summary of the analysis
summary(res.aov3)
## Df Sum Sq Mean Sq F value Pr(>F)
## MSSubClass 1 6.54e+10 6.54e+10 52.86 5.9e-13 ***
## LotArea 1 5.97e+11 5.97e+11 482.16 < 2e-16 ***
## OverallQual 1 5.47e+12 5.47e+12 4418.09 < 2e-16 ***
## YearBuilt 1 8.54e+10 8.54e+10 69.02 < 2e-16 ***
## YearRemodAdd 1 3.74e+10 3.74e+10 30.23 4.5e-08 ***
## MasVnrArea 1 2.17e+11 2.17e+11 175.45 < 2e-16 ***
## BsmtFinSF1 1 1.68e+11 1.68e+11 135.55 < 2e-16 ***
## X1stFlrSF 1 1.79e+11 1.79e+11 144.54 < 2e-16 ***
## X2ndFlrSF 1 4.65e+11 4.65e+11 375.61 < 2e-16 ***
## BsmtFullBath 1 1.91e+10 1.91e+10 15.47 8.8e-05 ***
## BedroomAbvGr 1 3.12e+10 3.12e+10 25.21 5.8e-07 ***
## TotRmsAbvGrd 1 1.87e+10 1.87e+10 15.13 0.00011 ***
## Fireplaces 1 1.17e+10 1.17e+10 9.41 0.00219 **
## GarageCars 1 4.60e+10 4.60e+10 37.14 1.4e-09 ***
## WoodDeckSF 1 1.33e+10 1.33e+10 10.73 0.00108 **
## Residuals 1444 1.79e+12 1.24e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(res.aov3, 2)
plot(res.aov3, 1)
ggplot(step.model, aes(x=.resid))+geom_histogram(bins=30)
#Check for VIFs
VIF(step.model)
## MSSubClass LotArea OverallQual YearBuilt YearRemodAdd MasVnrArea
## 1.223 1.201 2.777 2.188 1.820 1.353
## BsmtFinSF1 X1stFlrSF X2ndFlrSF BsmtFullBath BedroomAbvGr TotRmsAbvGrd
## 2.187 3.713 3.768 1.825 2.194 4.304
## Fireplaces GarageCars WoodDeckSF
## 1.447 1.918 1.149
#create vector of VIF values
vif_values1 <- VIF(step.model)
#create horizontal bar chart to display each VIF value again after the stepwise regression
barplot(vif_values1, main = "Fig6: VIF Values after step-wise regression", horiz = TRUE, col = "turquoise1",las=2,cex.names=.53,xlim=c(0,5))
#add vertical line at 5
abline(v = 5, lwd = 3, lty = 2)
#check whether the outliers need our attention
outlierTest(step.model)
## rstudent unadjusted p-value Bonferroni p
## 1299 -16.522 2.687e-56 3.923e-53
## 524 -10.337 3.267e-24 4.770e-21
## 1183 8.509 4.313e-17 6.298e-14
## 692 8.128 9.317e-16 1.360e-12
## 804 6.923 6.613e-12 9.655e-09
## 899 6.268 4.827e-10 7.048e-07
## 1047 6.154 9.790e-10 1.429e-06
## 1170 4.790 1.841e-06 2.688e-03
## 441 4.706 2.766e-06 4.039e-03
1) The coefficient slopes are still significant: t-statistics > 2 with correspondingly low p-values (< 0.05)
2) The F-values exhibit statistical significance
3) The VIF values of the predictors are all lower than 5
4) The adjusted R-squared of 0.804 is a marked improvement over Model 1's 0.771 (multiple R-squared)
5) The residuals of Model 2 look better: (A) they still exhibit some heteroscedasticity, but the curvature lessens as the fitted values increase; (B) normality of the residuals is not perfect but shows a marked improvement; (C) the histogram of residuals still reveals a few outliers, so an outlier test was run to see whether corrective action is needed
6) Several identified outliers have very small Bonferroni p-values, indicating they are statistically significant outliers; removing or down-weighting them would be a reasonable next refinement (see the Cook's distance sketch below)
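As a follow-up on point 6, Cook's distance is a standard way to gauge how influential the flagged rows are (a sketch using base R's plot method for lm objects):
#Cook's distance; observations such as 1299 and 524 should stand out
plot(step.model, which = 4)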
# Read in from Kaggle the test data set
url2<- "https://raw.githubusercontent.com/ssufian/DAT-605/master/test.csv"
test_df <- read.csv(url2, header = TRUE, sep = ",",stringsAsFactors = FALSE)
#replace NA with zeros for numeric columns like in Model 1
test_df1<-test_df %>% mutate_all(~replace(., is.na(.), 0))
#Filter Test Data from Kaggle to reflect only the numerical predictors that we had used in the training set
test_df2<- dplyr::select(test_df1,Id,MSSubClass,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,X1stFlrSF,X2ndFlrSF ,BsmtFullBath ,BedroomAbvGr,TotRmsAbvGrd , Fireplaces ,GarageCars,WoodDeckSF,PoolArea)
#Make predictions with selected predictors
test_results<-predict(step.model, test_df2)
head(test_results) #checking the first few predictions
## 1 2 3 4 5 6
## 113289 165950 175587 198751 189161 184604
#Making columns per Kaggle's instructions
prediction <- data.frame(Id = test_df2[,"Id"], SalePrice = test_results)
head(prediction, 10)
## Id SalePrice
## 1 1461 113289
## 2 1462 165950
## 3 1463 175587
## 4 1464 198751
## 5 1465 189161
## 6 1466 184604
## 7 1467 194618
## 8 1468 173252
## 9 1469 216060
## 10 1470 116969
write.csv(prediction, "salepricepredictions.csv", row.names = FALSE) #row.names = FALSE keeps the file to the two required columns
My Kaggle user name: s_sufian@hotmailcom
YouTube link: https://youtu.be/nZtMCDD_918