You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques. I want you to do the following.

1 Environment Set Up

library(tidyverse)
library(psych)
library(reshape)
library(readxl)
library(scales)
library(corrgram)
library(MASS)
library(knitr)
library(corrplot)
library(fitdistrplus)
library(gmodels)
library(randomForest)

2 Data Acquisition

train <- read_csv('https://raw.githubusercontent.com/niteen11/CUNY_DATA_605/master/data_finalexam/kaggle%20data%20set/all/train.csv')
test <- read_csv('https://raw.githubusercontent.com/niteen11/CUNY_DATA_605/master/data_finalexam/kaggle%20data%20set/all/test.csv')

dim(train)

## [1] 1460   81

kable(head(train))

Id	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	LotConfig	LandSlope	Neighborhood	Condition1	Condition2	BldgType	HouseStyle	OverallQual	OverallCond	YearBuilt	YearRemodAdd	RoofStyle	RoofMatl	Exterior1st	Exterior2nd	MasVnrType	MasVnrArea	ExterQual	ExterCond	Foundation	BsmtQual	BsmtCond	BsmtExposure	BsmtFinType1	BsmtFinSF1	BsmtFinType2	BsmtUnfSF	TotalBsmtSF	Heating	HeatingQC	CentralAir	Electrical	1stFlrSF	2ndFlrSF	GrLivArea	BsmtFullBath	BsmtHalfBath	FullBath	HalfBath	BedroomAbvGr	KitchenAbvGr	KitchenQual	TotRmsAbvGrd	Functional	Fireplaces	FireplaceQu	GarageType	GarageYrBlt	GarageFinish	GarageCars	GarageArea	GarageQual	GarageCond	PavedDrive	WoodDeckSF	OpenPorchSF	EnclosedPorch	3SsnPorch	PoolQC	Fence	MiscFeature	MiscVal	MoSold	YrSold	SaleType	SaleCondition	SalePrice
1	60	RL	65	8450	Pave	NA	Reg	Lvl	AllPub	Inside	Gtl	CollgCr	Norm	Norm	1Fam	2Story	7	5	2003	2003	Gable	CompShg	VinylSd	VinylSd	BrkFace	196	Gd	TA	PConc	Gd	TA	No	GLQ	706	Unf	150	856	GasA	Ex	Y	SBrkr	856	854	1710	1	0	2	1	3	1	Gd	8	Typ	0	NA	Attchd	2003	RFn	2	548	TA	TA	Y	0	61	0	0	NA	NA	NA	0	2	2008	WD	Normal	208500
2	20	RL	80	9600	Pave	NA	Reg	Lvl	AllPub	FR2	Gtl	Veenker	Feedr	Norm	1Fam	1Story	6	8	1976	1976	Gable	CompShg	MetalSd	MetalSd	None	0	TA	TA	CBlock	Gd	TA	Gd	ALQ	978	Unf	284	1262	GasA	Ex	Y	SBrkr	1262	0	1262	0	1	2	0	3	1	TA	6	Typ	1	TA	Attchd	1976	RFn	2	460	TA	TA	Y	298	0	0	0	NA	NA	NA	0	5	2007	WD	Normal	181500
3	60	RL	68	11250	Pave	NA	IR1	Lvl	AllPub	Inside	Gtl	CollgCr	Norm	Norm	1Fam	2Story	7	5	2001	2002	Gable	CompShg	VinylSd	VinylSd	BrkFace	162	Gd	TA	PConc	Gd	TA	Mn	GLQ	486	Unf	434	920	GasA	Ex	Y	SBrkr	920	866	1786	1	0	2	1	3	1	Gd	6	Typ	1	TA	Attchd	2001	RFn	2	608	TA	TA	Y	0	42	0	0	NA	NA	NA	0	9	2008	WD	Normal	223500
4	70	RL	60	9550	Pave	NA	IR1	Lvl	AllPub	Corner	Gtl	Crawfor	Norm	Norm	1Fam	2Story	7	5	1915	1970	Gable	CompShg	Wd Sdng	Wd Shng	None	0	TA	TA	BrkTil	TA	Gd	No	ALQ	216	Unf	540	756	GasA	Gd	Y	SBrkr	961	756	1717	1	0	1	0	3	1	Gd	7	Typ	1	Gd	Detchd	1998	Unf	3	642	TA	TA	Y	0	35	272	0	NA	NA	NA	0	2	2006	WD	Abnorml	140000
5	60	RL	84	14260	Pave	NA	IR1	Lvl	AllPub	FR2	Gtl	NoRidge	Norm	Norm	1Fam	2Story	8	5	2000	2000	Gable	CompShg	VinylSd	VinylSd	BrkFace	350	Gd	TA	PConc	Gd	TA	Av	GLQ	655	Unf	490	1145	GasA	Ex	Y	SBrkr	1145	1053	2198	1	0	2	1	4	1	Gd	9	Typ	1	TA	Attchd	2000	RFn	3	836	TA	TA	Y	192	84	0	0	NA	NA	NA	0	12	2008	WD	Normal	250000
6	50	RL	85	14115	Pave	NA	IR1	Lvl	AllPub	Inside	Gtl	Mitchel	Norm	Norm	1Fam	1.5Fin	5	5	1993	1995	Gable	CompShg	VinylSd	VinylSd	None	0	TA	TA	Wood	Gd	TA	No	GLQ	732	Unf	64	796	GasA	Ex	Y	SBrkr	796	566	1362	1	0	1	1	1	1	TA	5	Typ	0	NA	Attchd	1993	Unf	2	480	TA	TA	Y	40	30	0	320	NA	MnPrv	Shed	700	10	2009	WD	Normal	143000

kable(head(test))

Id	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	LotConfig	LandSlope	Neighborhood	Condition1	Condition2	BldgType	HouseStyle	OverallQual	OverallCond	YearBuilt	YearRemodAdd	RoofStyle	RoofMatl	Exterior1st	Exterior2nd	MasVnrType	MasVnrArea	ExterQual	ExterCond	Foundation	BsmtQual	BsmtCond	BsmtExposure	BsmtFinType1	BsmtFinSF1	BsmtFinType2	BsmtFinSF2	BsmtUnfSF	TotalBsmtSF	Heating	HeatingQC	CentralAir	Electrical	1stFlrSF	2ndFlrSF	GrLivArea	FullBath	HalfBath	BedroomAbvGr	KitchenAbvGr	KitchenQual	TotRmsAbvGrd	Functional	Fireplaces	FireplaceQu	GarageType	GarageYrBlt	GarageFinish	GarageCars	GarageArea	GarageQual	GarageCond	PavedDrive	WoodDeckSF	OpenPorchSF	ScreenPorch	PoolQC	Fence	MiscFeature	MiscVal	MoSold	YrSold	SaleType	SaleCondition
1461	20	RH	80	11622	Pave	NA	Reg	Lvl	AllPub	Inside	Gtl	NAmes	Feedr	Norm	1Fam	1Story	5	6	1961	1961	Gable	CompShg	VinylSd	VinylSd	None	0	TA	TA	CBlock	TA	TA	No	Rec	468	LwQ	144	270	882	GasA	TA	Y	SBrkr	896	0	896	1	0	2	1	TA	5	Typ	0	NA	Attchd	1961	Unf	1	730	TA	TA	Y	140	0	120	NA	MnPrv	NA	0	6	2010	WD	Normal
1462	20	RL	81	14267	Pave	NA	IR1	Lvl	AllPub	Corner	Gtl	NAmes	Norm	Norm	1Fam	1Story	6	6	1958	1958	Hip	CompShg	Wd Sdng	Wd Sdng	BrkFace	108	TA	TA	CBlock	TA	TA	No	ALQ	923	Unf	0	406	1329	GasA	TA	Y	SBrkr	1329	0	1329	1	1	3	1	Gd	6	Typ	0	NA	Attchd	1958	Unf	1	312	TA	TA	Y	393	36	0	NA	NA	Gar2	12500	6	2010	WD	Normal
1463	60	RL	74	13830	Pave	NA	IR1	Lvl	AllPub	Inside	Gtl	Gilbert	Norm	Norm	1Fam	2Story	5	5	1997	1998	Gable	CompShg	VinylSd	VinylSd	None	0	TA	TA	PConc	Gd	TA	No	GLQ	791	Unf	0	137	928	GasA	Gd	Y	SBrkr	928	701	1629	2	1	3	1	TA	6	Typ	1	TA	Attchd	1997	Fin	2	482	TA	TA	Y	212	34	0	NA	MnPrv	NA	0	3	2010	WD	Normal
1464	60	RL	78	9978	Pave	NA	IR1	Lvl	AllPub	Inside	Gtl	Gilbert	Norm	Norm	1Fam	2Story	6	6	1998	1998	Gable	CompShg	VinylSd	VinylSd	BrkFace	20	TA	TA	PConc	TA	TA	No	GLQ	602	Unf	0	324	926	GasA	Ex	Y	SBrkr	926	678	1604	2	1	3	1	Gd	7	Typ	1	Gd	Attchd	1998	Fin	2	470	TA	TA	Y	360	36	0	NA	NA	NA	0	6	2010	WD	Normal
1465	120	RL	43	5005	Pave	NA	IR1	HLS	AllPub	Inside	Gtl	StoneBr	Norm	Norm	TwnhsE	1Story	8	5	1992	1992	Gable	CompShg	HdBoard	HdBoard	None	0	Gd	TA	PConc	Gd	TA	No	ALQ	263	Unf	0	1017	1280	GasA	Ex	Y	SBrkr	1280	0	1280	2	0	2	1	Gd	5	Typ	0	NA	Attchd	1992	RFn	2	506	TA	TA	Y	0	82	144	NA	NA	NA	0	1	2010	WD	Normal
1466	60	RL	75	10000	Pave	NA	IR1	Lvl	AllPub	Corner	Gtl	Gilbert	Norm	Norm	1Fam	2Story	6	5	1993	1994	Gable	CompShg	HdBoard	HdBoard	None	0	TA	TA	PConc	Gd	TA	No	Unf	0	Unf	0	763	763	GasA	Gd	Y	SBrkr	763	892	1655	2	1	3	1	TA	7	Typ	1	TA	Attchd	1993	Fin	2	440	TA	TA	Y	157	84	0	NA	NA	NA	0	4	2010	WD	Normal

3 Descriptive and Inferential Statistics

Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

train %>%
  dplyr::select(LotArea, GrLivArea, TotalBsmtSF, SalePrice) %>%
  mutate(LotArea = (LotArea),
         GrLivArea = (GrLivArea),
         TotalBsmtSF = (TotalBsmtSF),
         SalePrice = (SalePrice)) %>%
  gather('key', 'value') %>%
  ggplot(aes(x = value, y = ..density..)) +
  geom_histogram(
                 fill = 'red',
                 alpha = 0.5,
                 bins = 15) +
  geom_density() +
  facet_wrap(~key, scales ='free_x') +
  labs(title = 'Histogram PLots')

ggplot(train, aes(train$LotArea, train$SalePrice)) +
  geom_point(aes(color=train$TotalBsmtSF)) +
  xlab("Lot Area") +
  ylab("Sales Price") +
  ggtitle("Sales Price vs. Lot Area") +
  scale_y_continuous(labels = comma)

cor <- train %>%
  dplyr::select(LotArea, GrLivArea, TotalBsmtSF, SalePrice) %>%
  cor() 

corrplot(cor, method = 'circle')

corrgram(cor, order=TRUE, upper.panel=panel.cor, main= "train")

cor.test(train$GrLivArea, train$SalePrice, method = 'pearson', conf.level = 0.80)

## 
##  Pearson's product-moment correlation
## 
## data:  train$GrLivArea and train$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6915087 0.7249450
## sample estimates:
##       cor 
## 0.7086245

cor.test(train$TotalBsmtSF, train$SalePrice, method = 'pearson', conf.level = 0.80)

## 
##  Pearson's product-moment correlation
## 
## data:  train$TotalBsmtSF and train$SalePrice
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5922142 0.6340846
## sample estimates:
##       cor 
## 0.6135806

cor.test(train$LotArea, train$SalePrice, method = 'pearson', conf.level = 0.80)

## 
##  Pearson's product-moment correlation
## 
## data:  train$LotArea and train$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2323391 0.2947946
## sample estimates:
##       cor 
## 0.2638434

The p-values for all three variables are less than 0.05. The correlation falls within the 80% CI

The familywise error rate (FWE or FWER) is the probability of a coming to at least one false conclusion in a series of hypothesis tests. In other words, it’s the probability of making at least one Type I Error.

\[1-(1-\alpha )^c\]

1-(.20/3)

## [1] 0.9333333

Now checking corelation for familywise.

cor.test(train$GrLivArea, train$SalePrice, method = 'pearson', conf.level = 0.93)

## 
##  Pearson's product-moment correlation
## 
## data:  train$GrLivArea and train$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 93 percent confidence interval:
##  0.6841885 0.7314712
## sample estimates:
##       cor 
## 0.7086245

cor.test(train$TotalBsmtSF, train$SalePrice, method = 'pearson', conf.level = 0.93)

## 
##  Pearson's product-moment correlation
## 
## data:  train$TotalBsmtSF and train$SalePrice
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 93 percent confidence interval:
##  0.5831186 0.6423195
## sample estimates:
##       cor 
## 0.6135806

cor.test(train$LotArea, train$SalePrice, method = 'pearson', conf.level = 0.93)

## 
##  Pearson's product-moment correlation
## 
## data:  train$LotArea and train$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 93 percent confidence interval:
##  0.219153 0.307429
## sample estimates:
##       cor 
## 0.2638434

4 Linear Algebra and Correlation

. Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

kable(cor)

	LotArea	GrLivArea	TotalBsmtSF	SalePrice
LotArea	1.0000000	0.2631162	0.2608331	0.2638434
GrLivArea	0.2631162	1.0000000	0.4548682	0.7086245
TotalBsmtSF	0.2608331	0.4548682	1.0000000	0.6135806
SalePrice	0.2638434	0.7086245	0.6135806	1.0000000

precision_matrix <- solve(cor)
kable(precision_matrix)

	LotArea	GrLivArea	TotalBsmtSF	SalePrice
LotArea	1.1062218	-0.1623394	-0.1703170	-0.0723285
GrLivArea	-0.1623394	2.0350650	-0.0397442	-1.3748784
TotalBsmtSF	-0.1703170	-0.0397442	1.6321069	-0.9283283
SalePrice	-0.0723285	-1.3748784	-0.9283283	2.5629601

kable(cor %*% precision_matrix)

	LotArea	GrLivArea	TotalBsmtSF	SalePrice
LotArea	1	0	0	0
GrLivArea	0	1	0	0
TotalBsmtSF	0	0	1	0
SalePrice	0	0	0	1

kable(precision_matrix%*% cor)

	LotArea	GrLivArea	TotalBsmtSF	SalePrice
LotArea	1	0	0	0
GrLivArea	0	1	0	0
TotalBsmtSF	0	0	1	0
SalePrice	0	0	0	1

Other Approach

mat <- train %>%
  dplyr::select(GrLivArea, TotalBsmtSF, SalePrice) %>%
  cor() %>%
  round(4)
kable(mat)

	GrLivArea	TotalBsmtSF	SalePrice
GrLivArea	1.0000	0.4549	0.7086
TotalBsmtSF	0.4549	1.0000	0.6136
SalePrice	0.7086	0.6136	1.0000

prec.mat <- solve(mat) %>%
  round(4)
kable(prec.mat)

	GrLivArea	TotalBsmtSF	SalePrice
GrLivArea	2.0111	-0.0648	-1.3853
TotalBsmtSF	-0.0648	1.6060	-0.9395
SalePrice	-1.3853	-0.9395	2.5581

kable(prec.mat %*% mat %>%
  round(4))

	GrLivArea	TotalBsmtSF	SalePrice
GrLivArea	1	0	0
TotalBsmtSF	0	1	0
SalePrice	0	0	1

kable(mat %*% prec.mat %>%
  round(4))

	GrLivArea	TotalBsmtSF	SalePrice
GrLivArea	1	0	0
TotalBsmtSF	0	1	0
SalePrice	0	0	1

5 Calculus-Based Probability & Statistics.

Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).

Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., \(rexp(1000,\lambda)\) )

par(mfrow=c(1,2))
hist(train$TotalBsmtSF, main="Histogram of TotalBsmtSF")
boxplot(train$TotalBsmtSF, main="Boxplot of TotalBsmtSF")

lambda <- fitdistr(train$TotalBsmtSF, densfun='exponential')
lambda$estimate

##         rate 
## 0.0009456896

set.seed(23)
pdf.dist <- rexp(1000,lambda$estimate)

hist(pdf.dist, freq = FALSE, breaks = 100, 
     main ="Fitted Exponential PDF with TotalBsmtSFTotal  ",
     xlab = "PDF TotalBsmtSFTotal",
     xlim = c(1, quantile(pdf.dist, 0.99)))
curve(dexp(x, rate = lambda$estimate), col = "red", add = TRUE)

Plot a histogram and compare it with a histogram of your original variable.

set.seed(23)
samp.TotalBsmtSF <- sample(train$TotalBsmtSF, 1000, replace=TRUE, prob=NULL)
exp.train <- data_frame(Expo=rexp(1000, lambda$estimate)) %>%
  mutate(TotalBsmtSF = samp.TotalBsmtSF)

plotdist(exp.train$TotalBsmtSF, histo = TRUE, demp = TRUE)

plotdist(exp.train$Expo, histo = TRUE, demp = TRUE)

hist(train$TotalBsmtSF, freq = FALSE, breaks = 100, 
     main ="Comparison with original TotalBsmtSF",
     xlab="Original TotalBsmtSF",
     xlim = c(1, quantile(train$TotalBsmtSF, 0.99)))
curve(dexp(x, rate = lambda$estimate), col = "red", add = TRUE)

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).

pdf_5th <- qexp(0.05, rate = lambda$estimate, lower.tail = TRUE, log.p = FALSE)
pdf_5th <- round(pdf_5th, 4)
pdf_95th <- qexp(0.95, rate = lambda$estimate, lower.tail = TRUE, log.p = FALSE)
pdf_95th <- round(pdf_95th, 4)

Also generate a 95% confidence interval from the empirical data, assuming normality.

ci(train$TotalBsmtSF, 0.95)

##   Estimate   CI lower   CI upper Std. Error 
## 1057.42945 1034.90755 1079.95135   11.48144

Finally, provide the empirical 5th percentile and 95th percentile of the data.

quantile(train$TotalBsmtSF, c(.05, .95))

##     5%    95% 
##  519.3 1753.0

6 Model

Cleaning Train dataset

names(train) <- make.names(names(train))
features <- setdiff(colnames(train), c("Id", "SalePrice"))
for (i in features) {
  if (any(is.na(train[[i]]))) 
    if (is.character(train[[i]])){ 
      train[[i]][is.na(train[[i]])] <- "Others"
    }else{
      train[[i]][is.na(train[[i]])] <- -999  
    }
}

col_class <- lapply(train,class)
col_class <- col_class[col_class != "factor"]
factor_levels <- lapply(train, nlevels)
factor_levels <- factor_levels[factor_levels > 1]
train <- train[,names(train) %in% c(names(factor_levels), names(col_class))]

train <- as.data.frame(unclass(train))
ftrain<-select_if(train, is.numeric)

Clean Test Dataset

names(test) <- make.names(names(test))
features <- setdiff(colnames(test), c("Id", "SalePrice"))
for (j in features) {
  if (any(is.na(test[[j]]))) 
    if (is.character(test[[j]])){ 
      test[[j]][is.na(test[[j]])] <- "Others"
    }else{
      test[[j]][is.na(test[[j]])] <- -999  
    }
}

col_class <- lapply(test,class)
col_class <- col_class[col_class != "factor"]
factor_levels <- lapply(test, nlevels)
factor_levels <- factor_levels[factor_levels > 1]
train <- test[,names(test) %in% c(names(factor_levels), names(col_class))]

test <- as.data.frame(unclass(test))
ftest<-select_if(test, is.numeric)

6.1 Baseline Linear Model

model1 <- lm(SalePrice ~ . , data=ftrain)
summary(model1)

## 
## Call:
## lm(formula = SalePrice ~ ., data = ftrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -475262  -16290   -2217   14029  295097 
## 
## Coefficients: (2 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.143e+05  1.403e+06   0.295 0.767771    
## Id            -1.048e+00  2.171e+00  -0.483 0.629203    
## MSSubClass    -1.687e+02  2.608e+01  -6.468 1.36e-10 ***
## LotFrontage    3.109e+00  2.299e+00   1.352 0.176463    
## LotArea        4.024e-01  1.003e-01   4.013 6.30e-05 ***
## OverallQual    1.736e+04  1.181e+03  14.697  < 2e-16 ***
## OverallCond    5.153e+03  1.025e+03   5.027 5.61e-07 ***
## YearBuilt      3.497e+02  6.067e+01   5.764 1.01e-08 ***
## YearRemodAdd   1.115e+02  6.619e+01   1.684 0.092412 .  
## MasVnrArea     2.150e+01  5.193e+00   4.141 3.66e-05 ***
## BsmtFinSF1     1.900e+01  4.628e+00   4.105 4.27e-05 ***
## BsmtFinSF2     8.632e+00  7.011e+00   1.231 0.218432    
## BsmtUnfSF      8.368e+00  4.176e+00   2.004 0.045279 *  
## TotalBsmtSF           NA         NA      NA       NA    
## X1stFlrSF      4.812e+01  5.731e+00   8.397  < 2e-16 ***
## X2ndFlrSF      4.889e+01  4.917e+00   9.943  < 2e-16 ***
## LowQualFinSF   1.731e+01  1.970e+01   0.879 0.379726    
## GrLivArea             NA         NA      NA       NA    
## BsmtFullBath   8.486e+03  2.598e+03   3.267 0.001114 ** 
## BsmtHalfBath   1.862e+03  4.058e+03   0.459 0.646417    
## FullBath       3.219e+03  2.803e+03   1.148 0.250967    
## HalfBath      -1.563e+03  2.644e+03  -0.591 0.554423    
## BedroomAbvGr  -1.021e+04  1.683e+03  -6.066 1.68e-09 ***
## KitchenAbvGr  -1.543e+04  5.200e+03  -2.968 0.003048 ** 
## TotRmsAbvGrd   4.836e+03  1.230e+03   3.932 8.84e-05 ***
## Fireplaces     4.314e+03  1.766e+03   2.443 0.014694 *  
## GarageYrBlt   -9.649e+00  1.773e+00  -5.441 6.23e-08 ***
## GarageCars     1.568e+04  2.978e+03   5.266 1.61e-07 ***
## GarageArea     5.190e+00  9.707e+00   0.535 0.592957    
## WoodDeckSF     2.568e+01  7.923e+00   3.242 0.001215 ** 
## OpenPorchSF   -5.835e+00  1.509e+01  -0.387 0.698981    
## EnclosedPorch  1.222e+01  1.674e+01   0.730 0.465505    
## X3SsnPorch     2.017e+01  3.118e+01   0.647 0.517861    
## ScreenPorch    5.712e+01  1.707e+01   3.347 0.000837 ***
## PoolArea      -3.455e+01  2.346e+01  -1.473 0.140989    
## MiscVal       -3.759e-01  1.847e+00  -0.204 0.838748    
## MoSold        -5.100e+01  3.424e+02  -0.149 0.881631    
## YrSold        -6.839e+02  6.974e+02  -0.981 0.326921    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34500 on 1424 degrees of freedom
## Multiple R-squared:  0.816,  Adjusted R-squared:  0.8114 
## F-statistic: 180.4 on 35 and 1424 DF,  p-value: < 2.2e-16

par(mfrow=c(2,2))
plot(model1)

6.2 Significant Variables (LM)

sigvars <- data.frame(summary(model1)$coef[summary(model1)$coef[,4] <= .05, 4])
sigvars <- add_rownames(sigvars, "vars")

colist<-dplyr::pull(sigvars, vars)
matchid <- match(colist, names(ftrain))
train2 <- cbind(ftrain[,matchid], ftrain['SalePrice'])

model2<-lm(SalePrice ~ ., data=train2)
summary(model2)

## 
## Call:
## lm(formula = SalePrice ~ ., data = train2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -485362  -16106   -2176   14198  277137 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -8.060e+05  8.784e+04  -9.176  < 2e-16 ***
## MSSubClass   -1.648e+02  2.561e+01  -6.436 1.67e-10 ***
## LotArea       4.076e-01  9.914e-02   4.111 4.16e-05 ***
## OverallQual   1.821e+04  1.146e+03  15.884  < 2e-16 ***
## OverallCond   5.692e+03  9.195e+02   6.190 7.84e-10 ***
## YearBuilt     3.809e+02  4.455e+01   8.550  < 2e-16 ***
## MasVnrArea    2.041e+01  5.089e+00   4.010 6.39e-05 ***
## BsmtFinSF1    1.559e+01  3.862e+00   4.036 5.71e-05 ***
## BsmtUnfSF     6.304e+00  3.604e+00   1.749 0.080498 .  
## X1stFlrSF     5.211e+01  5.039e+00  10.341  < 2e-16 ***
## X2ndFlrSF     4.860e+01  4.029e+00  12.062  < 2e-16 ***
## BsmtFullBath  8.699e+03  2.379e+03   3.657 0.000264 ***
## BedroomAbvGr -1.037e+04  1.629e+03  -6.363 2.65e-10 ***
## KitchenAbvGr -1.610e+04  5.049e+03  -3.188 0.001463 ** 
## TotRmsAbvGrd  5.168e+03  1.200e+03   4.307 1.76e-05 ***
## Fireplaces    3.302e+03  1.718e+03   1.922 0.054783 .  
## GarageYrBlt  -1.041e+01  1.716e+00  -6.067 1.66e-09 ***
## GarageCars    1.766e+04  2.031e+03   8.696  < 2e-16 ***
## WoodDeckSF    2.517e+01  7.819e+00   3.219 0.001317 ** 
## ScreenPorch   5.360e+01  1.674e+01   3.203 0.001391 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34490 on 1440 degrees of freedom
## Multiple R-squared:  0.814,  Adjusted R-squared:  0.8115 
## F-statistic: 331.6 on 19 and 1440 DF,  p-value: < 2.2e-16

par(mfrow=c(2,2))
plot(model2)

6.3 Random Forest Model

A random forest, or random decision forests, are an “ensemble learning method for classification, regression, and other tasks.” To use the random forest method in R, the randomForest package must be loaded to create and analyze random forests

y_train <- ftrain$SalePrice
x_train_df <- subset(ftrain, select = -SalePrice)
model3 <- randomForest(x_train_df, y = y_train, ntree = 500, importance = T)
plot(model3)

7 Prediction

pred1<-predict(model1, ftest)
kaggle <- as.data.frame(cbind(ftest$Id, pred1))
colnames(kaggle) <- c("Id", "SalePrice")
#write.csv(kaggle, file = "Kaggle_housePrice_Submission1.csv", quote=FALSE, row.names=FALSE)

pred2<-predict(model2, ftest)
#export data for Kaggle
kaggle <- as.data.frame(cbind(ftest$Id, pred2))
colnames(kaggle) <- c("Id", "SalePrice")
#write.csv(kaggle, file = "Kaggle_housePrice_Submission2.csv", quote=FALSE, row.names=FALSE)

pred3<-predict(model3, ftest)
kaggle <- as.data.frame(cbind(ftest$Id, pred3))
colnames(kaggle) <- c("Id", "SalePrice")
#write.csv(kaggle, file = "Kaggle_housePrice_Submission3.csv", quote=FALSE, row.names=FALSE)

8 Conclusions

Out of the three models created, (Baseline LM, LM with highly significant variables, and RF), the Random Forest made the best predictions.

9 Reference

https://github.com/Kaggle/kaggle-api
https://www.kaggle.com/c/house-prices-advanced-regression-techniques

DATA605_Final_Exam_P2

Niteen Kumar

December 16, 2018