library(readr)      # read_csv
library(tidyverse)  # dplyr, ggplot2, and friends
library(stringr)
library(purrr)
library(plyr)       # masks several dplyr verbs, hence the explicit dplyr:: calls below
library(corrplot)   # correlation plot
library(Hmisc)
library(GGally)     # ggpairs scatterplot matrix
library(matlib)     # LU decomposition
library(MASS)       # fitdistr; also masks select(), another reason for dplyr::select()
You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following:
path = "https://raw.githubusercontent.com/kelloggjohnd/Data605/master/train.csv"
train.set<- read_csv(file = path)
## Parsed with column specification:
## cols(
## .default = col_character(),
## Id = col_double(),
## MSSubClass = col_double(),
## LotFrontage = col_double(),
## LotArea = col_double(),
## OverallQual = col_double(),
## OverallCond = col_double(),
## YearBuilt = col_double(),
## YearRemodAdd = col_double(),
## MasVnrArea = col_double(),
## BsmtFinSF1 = col_double(),
## BsmtFinSF2 = col_double(),
## BsmtUnfSF = col_double(),
## TotalBsmtSF = col_double(),
## `1stFlrSF` = col_double(),
## `2ndFlrSF` = col_double(),
## LowQualFinSF = col_double(),
## GrLivArea = col_double(),
## BsmtFullBath = col_double(),
## BsmtHalfBath = col_double(),
## FullBath = col_double()
## # ... with 18 more columns
## )
## See spec(...) for full column specifications.
summary(train.set)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1 Min. : 20.0 Length:1460 Min. : 21
## 1st Qu.: 366 1st Qu.: 20.0 Class :character 1st Qu.: 59
## Median : 730 Median : 50.0 Mode :character Median : 69
## Mean : 730 Mean : 56.9 Mean : 70
## 3rd Qu.:1095 3rd Qu.: 70.0 3rd Qu.: 80
## Max. :1460 Max. :190.0 Max. :313
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig
## Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## LandSlope Neighborhood Condition1
## Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Condition2 BldgType HouseStyle OverallQual
## Length:1460 Length:1460 Length:1460 Min. : 1.0
## Class :character Class :character Class :character 1st Qu.: 5.0
## Mode :character Mode :character Mode :character Median : 6.0
## Mean : 6.1
## 3rd Qu.: 7.0
## Max. :10.0
##
## OverallCond YearBuilt YearRemodAdd RoofStyle
## Min. :1.00 Min. :1872 Min. :1950 Length:1460
## 1st Qu.:5.00 1st Qu.:1954 1st Qu.:1967 Class :character
## Median :5.00 Median :1973 Median :1994 Mode :character
## Mean :5.58 Mean :1971 Mean :1985
## 3rd Qu.:6.00 3rd Qu.:2000 3rd Qu.:2004
## Max. :9.00 Max. :2010 Max. :2010
##
## RoofMatl Exterior1st Exterior2nd
## Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## MasVnrType MasVnrArea ExterQual ExterCond
## Length:1460 Min. : 0 Length:1460 Length:1460
## Class :character 1st Qu.: 0 Class :character Class :character
## Mode :character Median : 0 Mode :character Mode :character
## Mean : 104
## 3rd Qu.: 166
## Max. :1600
## NA's :8
## Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0 Length:1460
## Class :character Class :character 1st Qu.: 0 Class :character
## Mode :character Mode :character Median : 384 Mode :character
## Mean : 444
## 3rd Qu.: 712
## Max. :5644
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.0 Min. : 0 Min. : 0 Length:1460
## 1st Qu.: 0.0 1st Qu.: 223 1st Qu.: 796 Class :character
## Median : 0.0 Median : 478 Median : 992 Mode :character
## Mean : 46.5 Mean : 567 Mean :1057
## 3rd Qu.: 0.0 3rd Qu.: 808 3rd Qu.:1298
## Max. :1474.0 Max. :2336 Max. :6110
##
## HeatingQC CentralAir Electrical 1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.0 Min. : 334 Min. :0.000
## 1st Qu.: 0 1st Qu.: 0.0 1st Qu.:1130 1st Qu.:0.000
## Median : 0 Median : 0.0 Median :1464 Median :0.000
## Mean : 347 Mean : 5.8 Mean :1515 Mean :0.425
## 3rd Qu.: 728 3rd Qu.: 0.0 3rd Qu.:1777 3rd Qu.:1.000
## Max. :2065 Max. :572.0 Max. :5642 Max. :3.000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.0000 Min. :0.00 Min. :0.000 Min. :0.00
## 1st Qu.:0.0000 1st Qu.:1.00 1st Qu.:0.000 1st Qu.:2.00
## Median :0.0000 Median :2.00 Median :0.000 Median :3.00
## Mean :0.0575 Mean :1.57 Mean :0.383 Mean :2.87
## 3rd Qu.:0.0000 3rd Qu.:2.00 3rd Qu.:1.000 3rd Qu.:3.00
## Max. :2.0000 Max. :3.00 Max. :2.000 Max. :8.00
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.00 Length:1460 Min. : 2.00 Length:1460
## 1st Qu.:1.00 Class :character 1st Qu.: 5.00 Class :character
## Median :1.00 Mode :character Median : 6.00 Mode :character
## Mean :1.05 Mean : 6.52
## 3rd Qu.:1.00 3rd Qu.: 7.00
## Max. :3.00 Max. :14.00
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1978
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.00 Min. : 0 Length:1460
## Class :character 1st Qu.:1.00 1st Qu.: 334 Class :character
## Mode :character Median :2.00 Median : 480 Mode :character
## Mean :1.77 Mean : 473
## 3rd Qu.:2.00 3rd Qu.: 576
## Max. :4.00 Max. :1418
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.0 Min. : 0.0
## Class :character Class :character 1st Qu.: 0.0 1st Qu.: 0.0
## Mode :character Mode :character Median : 0.0 Median : 25.0
## Mean : 94.2 Mean : 46.7
## 3rd Qu.:168.0 3rd Qu.: 68.0
## Max. :857.0 Max. :547.0
##
## EnclosedPorch 3SsnPorch ScreenPorch PoolArea
## Min. : 0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0 Median : 0.0 Median : 0.0 Median : 0.0
## Mean : 22 Mean : 3.4 Mean : 15.1 Mean : 2.8
## 3rd Qu.: 0 3rd Qu.: 0.0 3rd Qu.: 0.0 3rd Qu.: 0.0
## Max. :552 Max. :508.0 Max. :480.0 Max. :738.0
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0
## Class :character Class :character Class :character 1st Qu.: 0
## Mode :character Mode :character Mode :character Median : 0
## Mean : 43
## 3rd Qu.: 0
## Max. :15500
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.00 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.00 1st Qu.:2007 Class :character Class :character
## Median : 6.00 Median :2008 Mode :character Mode :character
## Mean : 6.32 Mean :2008
## 3rd Qu.: 8.00 3rd Qu.:2009
## Max. :12.00 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
To ensure we have clean data to work from, I removed any column with heavy NA counts, such as Alley and LotFrontage; replacing the NAs in those columns would not add value for our purposes.
First, I want to cast a very wide net with a simple correlation of the non-categorical columns to see if there are any striking correlations I can work with later. Using this data, I can see which other variables could be added.
corr.df <- train.set%>%
dplyr::select("MSSubClass","LotArea","OverallQual","OverallCond","YearBuilt","YearRemodAdd","BsmtFinSF1","BsmtUnfSF","TotalBsmtSF","1stFlrSF","2ndFlrSF","LowQualFinSF","GrLivArea","BsmtFullBath","BsmtHalfBath","FullBath","HalfBath","BedroomAbvGr","KitchenAbvGr","TotRmsAbvGrd","Fireplaces","GarageYrBlt","GarageCars","GarageArea","WoodDeckSF","OpenPorchSF","EnclosedPorch","3SsnPorch","ScreenPorch","PoolArea","MiscVal","MoSold","YrSold","SalePrice")%>%
replace(is.na(.),0)   # fill the remaining NAs with 0 so cor() returns complete values
corr.data <- cor(corr.df)
corrplot(corr.data, order = "hclust", tl.col = "black", tl.srt = 45, method = "ellipse", bg="black")
The plot shows a number of strong correlations involving above-ground living area (GrLivArea). There is a definite correlation between GrLivArea and total rooms above ground (TotRmsAbvGrd), which makes sense. Another correlation that makes sense is between SalePrice and GrLivArea.
The plot also shows strong correlations involving overall quality (OverallQual).
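As a quick cross-check on that visual reading, a small sketch (an addition, not part of the original workflow) ranks the numeric variables by their correlation with SalePrice straight from the corr.data matrix computed above:
head(sort(corr.data[, "SalePrice"], decreasing = TRUE), 10)   # SalePrice itself (1.0) will head the list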
corr.data.df <- as.data.frame(corr.data)
rnames <- list("OverallQual", "SalePrice", "GrLivArea", "TotRmsAbvGrd","LotArea", "OverallCond")
corr.data.matrix <- corr.data.df %>%
dplyr::select(OverallQual, SalePrice, GrLivArea, TotRmsAbvGrd, LotArea, OverallCond)
corr.data.matrix<-subset(corr.data.matrix, row.names(corr.data.matrix) %in% rnames)
train.set$OverallQual.f <-as.factor(as.character(train.set$OverallQual))   # factor copy of OverallQual for the boxplot fill
ggplot(train.set, aes(x=OverallQual, y=SalePrice, fill=OverallQual.f))+
geom_boxplot()+
ggtitle("Overall Quality & Sale Price")
ggplot(train.set, aes(x=GrLivArea, y=SalePrice, fill=OverallQual.f))+
geom_boxplot()+
ggtitle("Ground Living Area & Sale Price")
ggplot(train.set, aes(x=TotRmsAbvGrd, y=SalePrice, fill=OverallQual.f))+
geom_boxplot()+
ggtitle("Total Rooms above Ground & Sale Price")
corr.data.matrix
## OverallQual SalePrice GrLivArea TotRmsAbvGrd LotArea
## LotArea 0.10581 0.26384 0.26312 0.19001 1.000000
## OverallQual 1.00000 0.79098 0.59301 0.42745 0.105806
## OverallCond -0.09193 -0.07786 -0.07969 -0.05758 -0.005636
## GrLivArea 0.59301 0.70862 1.00000 0.82549 0.263116
## TotRmsAbvGrd 0.42745 0.53372 0.82549 1.00000 0.190015
## SalePrice 0.79098 1.00000 0.70862 0.53372 0.263843
## OverallCond
## LotArea -0.005636
## OverallQual -0.091932
## OverallCond 1.000000
## GrLivArea -0.079686
## TotRmsAbvGrd -0.057583
## SalePrice -0.077856
cor.test(corr.data.matrix$TotRmsAbvGrd,corr.data.matrix$GrLivArea, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: corr.data.matrix$TotRmsAbvGrd and corr.data.matrix$GrLivArea
## t = 5.3, df = 4, p-value = 0.006
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.7470 0.9851
## sample estimates:
## cor
## 0.9362
This pair was a given for testing the data: it makes logical sense for total rooms above ground to be correlated with above-ground living area. The p-value is acceptably low, so we reject the hypothesis that the correlation is zero, and the confidence interval is narrow.
cor.test(corr.data.matrix$OverallQual,corr.data.matrix$SalePrice, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: corr.data.matrix$OverallQual and corr.data.matrix$SalePrice
## t = 5.2, df = 4, p-value = 0.007
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.7331 0.9842
## sample estimates:
## cor
## 0.9323
The correlation is not zero: the p-value is acceptable and the confidence interval is narrow enough to give us confidence in this relationship.
cor.test(corr.data.matrix$GrLivArea,corr.data.matrix$SalePrice, conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: corr.data.matrix$GrLivArea and corr.data.matrix$SalePrice
## t = 2.8, df = 4, p-value = 0.05
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.3705 0.9535
## sample estimates:
## cor
## 0.8106
The correlation is not zero, and the p-value sits right at the edge of acceptability (near enough to 0.05 to accept it). However, the confidence interval is wide, from 0.37 to 0.95, so we are less confident in this relationship.
cor.test(corr.data.matrix$TotRmsAbvGrd,corr.data.matrix$SalePrice, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: corr.data.matrix$TotRmsAbvGrd and corr.data.matrix$SalePrice
## t = 1.6, df = 4, p-value = 0.2
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.004588 0.900569
## sample estimates:
## cor
## 0.6263
The estimated correlation is not zero, but the p-value is not acceptable and the confidence interval is much wider than in the other tests, so further work is needed before we can be confident in this relationship.
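On the familywise-error question: the tests above were run on the columns of the 6 x 6 correlation matrix, which is why they report df = 4. As a hedged cross-check (an addition, not part of the original workflow), cor.test can be run on the raw training columns, which uses all 1,460 houses, and p.adjust can apply a Bonferroni correction across the three pairwise tests so the familywise error rate stays controlled:
pairs.to.test <- list(c("GrLivArea", "TotRmsAbvGrd"),
                      c("OverallQual", "SalePrice"),
                      c("GrLivArea", "SalePrice"))
raw.p <- sapply(pairs.to.test, function(p)
  cor.test(corr.df[[p[1]]], corr.df[[p[2]]], conf.level = 0.8)$p.value)
p.adjust(raw.p, method = "bonferroni")   # Bonferroni-adjusted p-values for the three tests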
Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
corr.matrix <- as.matrix(corr.data.matrix)
(precisionmatrix <- solve(corr.matrix))
## LotArea OverallQual OverallCond GrLivArea TotRmsAbvGrd
## OverallQual 0.33252 2.80353 0.07353 -0.43954 0.23295
## SalePrice -0.41401 -2.11241 -0.01098 -1.35036 0.14210
## GrLivArea -0.28038 -0.43954 0.05977 4.70614 -2.91955
## TotRmsAbvGrd 0.09491 0.23295 -0.01446 -2.91955 3.21577
## LotArea 1.12972 0.33252 -0.01217 -0.28038 0.09491
## OverallCond -0.01217 0.07353 1.00977 0.05977 -0.01446
## SalePrice
## OverallQual -2.11241
## SalePrice 3.66031
## GrLivArea -1.35036
## TotRmsAbvGrd 0.14210
## LotArea -0.41401
## OverallCond -0.01098
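As a small illustration of the prompt's note that the precision matrix carries variance inflation factors on its diagonal, here is a hedged sketch (an addition, not part of the original workflow) that builds the six-variable correlation matrix directly, so its rows and columns share one ordering, and reads the VIFs off the diagonal of its inverse:
vars6 <- c("OverallQual", "SalePrice", "GrLivArea", "TotRmsAbvGrd", "LotArea", "OverallCond")
round(diag(solve(cor(corr.df[, vars6]))), 3)   # VIF of each variable given the other five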
matrix1 <- as.matrix(corr.data.matrix) %*% as.matrix(precisionmatrix)
matrix1
## LotArea OverallQual
## LotArea 1.000000000000000000000 0.00000000000000009259
## OverallQual 0.000000000000000008240 1.00000000000000044409
## OverallCond -0.000000000000000003469 -0.00000000000000001388
## GrLivArea 0.000000000000000049006 -0.00000000000000013618
## TotRmsAbvGrd -0.000000000000000051608 0.00000000000000015786
## SalePrice -0.000000000000000009541 -0.00000000000000003816
## OverallCond GrLivArea
## LotArea 0.000000000000000000000 -0.000000000000000140133
## OverallQual 0.000000000000000000000 0.000000000000000001735
## OverallCond 1.000000000000000000000 0.000000000000000006939
## GrLivArea 0.000000000000000000000 1.000000000000000000000
## TotRmsAbvGrd 0.000000000000000006939 0.000000000000000385542
## SalePrice 0.000000000000000000000 -0.000000000000000272352
## TotRmsAbvGrd SalePrice
## LotArea -0.00000000000000012059 -0.000000000000000052530
## OverallQual 0.00000000000000002494 0.000000000000000022985
## OverallCond -0.00000000000000001041 0.000000000000000019082
## GrLivArea 0.00000000000000004055 -0.000000000000000002711
## TotRmsAbvGrd 0.99999999999999977796 -0.000000000000000049440
## SalePrice -0.00000000000000003903 1.000000000000000000000
matrix2 <-as.matrix(precisionmatrix) %*% as.matrix(corr.data.matrix)
matrix2
## OverallQual SalePrice
## OverallQual 1.00000000000000044409 0.000000000000000444089
## SalePrice 0.00000000000000000000 0.999999999999999555911
## GrLivArea 0.00000000000000000000 -0.000000000000000222045
## TotRmsAbvGrd 0.00000000000000040246 0.000000000000000527356
## LotArea 0.00000000000000000000 0.000000000000000000000
## OverallCond 0.00000000000000001214 0.000000000000000008674
## GrLivArea TotRmsAbvGrd
## OverallQual 0.000000000000000000000 0.000000000000000222045
## SalePrice -0.000000000000000444089 -0.000000000000000444089
## GrLivArea 0.999999999999999555911 0.000000000000000333067
## TotRmsAbvGrd 0.000000000000000180411 1.000000000000000000000
## LotArea -0.000000000000000055511 -0.000000000000000111022
## OverallCond 0.000000000000000007806 0.000000000000000008674
## LotArea OverallCond
## OverallQual 0.000000000000000111022 -0.000000000000000027756
## SalePrice -0.000000000000000222045 0.000000000000000055511
## GrLivArea -0.000000000000000055511 -0.000000000000000013878
## TotRmsAbvGrd -0.000000000000000027756 -0.000000000000000078063
## LotArea 1.000000000000000222045 0.000000000000000006939
## OverallCond 0.000000000000000002168 1.000000000000000000000
b <- corr.data.matrix$SalePrice
LU(as.matrix(corr.data.matrix),b, verbose = FALSE)
## $P
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1 0 0 0 0 0
## [2,] 0 1 0 0 0 0
## [3,] 0 0 1 0 0 0
## [4,] 0 0 0 1 0 0
## [5,] 0 0 0 0 1 0
## [6,] 0 0 0 0 0 1
##
## $L
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.0000 0.00000 0.000 0.0000 0.000 0
## [2,] 9.4513 1.00000 0.000 0.0000 0.000 0
## [3,] -0.8689 -0.08891 1.000 0.0000 0.000 0
## [4,] 5.6047 0.45231 -19.631 1.0000 0.000 0
## [5,] 4.0400 0.31256 -18.220 3.9593 1.000 0
## [6,] 7.4758 0.57112 9.088 0.2305 -1.317 1
##
## $U
## OverallQual SalePrice GrLivArea TotRmsAbvGrd LotArea OverallCond
## [1,] 0.1058 0.2638 0.26312 0.19001 1.0000 -0.005636
## [2,] 0.0000 -1.7027 -1.89378 -1.36843 -9.3455 -0.038662
## [3,] 0.0000 0.0000 -0.01945 -0.01416 0.0323 0.991665
## [4,] 0.0000 0.0000 0.00000 0.10157 -0.4805 19.436660
## [5,] 0.0000 0.0000 0.00000 0.00000 1.5620 -58.910675
## [6,] 0.0000 0.0000 0.00000 0.00000 0.0000 -91.095864
##
## $d
## [,1]
## [1,] 0.2638
## [2,] -1.7027
## [3,] 0.0000
## [4,] 0.0000
## [5,] 0.0000
## [6,] 0.0000
##
## $x
## [,1]
## [1,] 0
## [2,] 1
## [3,] 0
## [4,] 0
## [5,] 0
## [6,] 0
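To make the factorization above more transparent, here is a minimal hand-rolled sketch of Doolittle elimination without pivoting (an illustration only; it assumes non-zero pivots, whereas matlib's LU also returns the permutation matrix P and solves for the right-hand side b):
lu.decompose <- function(A) {
  n <- nrow(A)
  L <- diag(n); U <- A
  for (k in 1:(n - 1)) {
    for (i in (k + 1):n) {
      L[i, k] <- U[i, k] / U[k, k]          # multiplier stored below the diagonal of L
      U[i, ]  <- U[i, ] - L[i, k] * U[k, ]  # eliminate the entry below the pivot
    }
  }
  list(L = L, U = U)                        # A should equal L %*% U up to rounding error
}
lu.check <- lu.decompose(as.matrix(corr.data.matrix))
max(abs(lu.check$L %*% lu.check$U - as.matrix(corr.data.matrix)))   # near machine precision if the factorization holds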
Many times, it makes sense to fit a closed form distribution to data.
hist(train.set$LotArea,breaks = 50)
fitmodel<-train.set$LotArea
min(fitmodel)
## [1] 1300
fit <- fitdistr(fitmodel, "exponential")
fit
## rate
## 0.000095086
## (0.000002489)
optmodel<- fit$estimate
sim<- rexp(1000,optmodel)
hist(sim,breaks = 50)
quantile(sim, probs = c(0.05,0.95))
## 5% 95%
## 644 28987
normality <- rnorm(length(fitmodel),mean(fitmodel),sd(fitmodel))
hist(normality)
quantile(normality, probs = c(0.05,0.95))
## 5% 95%
## -6131 26138
normality.df <- data.frame(length = normality)
normality.df$from <- "Normality"
sim.df <- data.frame(length = sim)
sim.df$from <- "Sim"
fitmodel.df <- data.frame(length = fitmodel)
fitmodel.df$from <- "Model"
total.df <- rbind(normality.df, sim.df, fitmodel.df)
ggplot(total.df, aes(length, fill=from))+
geom_density(alpha =0.5)
It seems the data had a lot of huge outliers, which threw the fit into a heavy right skew at first. When compared with the normal and simulated exponential draws, the model data (the observed LotArea values) stays very close to both of them.
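Since LotArea is strictly positive and heavily right-skewed, a log-normal is another natural closed-form candidate. A short hedged sketch (an addition, not part of the original submission) compares it with the exponential fit by AIC:
fit.exp   <- fitdistr(train.set$LotArea, "exponential")
fit.lnorm <- fitdistr(train.set$LotArea, "lognormal")
AIC(fit.exp, fit.lnorm)   # the lower AIC marks the better-fitting distribution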
10 points. Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
good.corr <- corr.data.df%>%
mutate(Vname = row.names(corr.data.df))%>%
filter(SalePrice >=.3)%>%                  # keep variables whose correlation with SalePrice is at least 0.3
dplyr::select(Vname,SalePrice)
train.frame <- dplyr::select(corr.df, good.corr$Vname)%>%
mutate(YearBuilt.m = 2017-YearBuilt)%>%    # convert year built to an approximate house age
dplyr::select(-YearBuilt)
train.model <- lm(SalePrice ~ .,data = train.frame)
summary(train.model)
##
## Call:
## lm(formula = SalePrice ~ ., data = train.frame)
##
## Residuals:
## Min 1Q Median 3Q Max
## -515923 -17443 -1977 14269 288868
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -703322.67 122643.83 -5.73 0.0000000118802 ***
## OverallQual 19407.48 1173.62 16.54 < 0.0000000000000002 ***
## YearRemodAdd 323.37 62.26 5.19 0.0000002358310 ***
## BsmtFinSF1 18.32 2.60 7.04 0.0000000000029 ***
## TotalBsmtSF 12.42 4.33 2.87 0.00418 **
## `1stFlrSF` 32.00 20.96 1.53 0.12710
## `2ndFlrSF` 23.65 20.61 1.15 0.25135
## GrLivArea 17.73 20.52 0.86 0.38758
## FullBath -2423.25 2639.40 -0.92 0.35872
## TotRmsAbvGrd 1668.36 1097.28 1.52 0.12862
## Fireplaces 7697.14 1791.53 4.30 0.0000185196269 ***
## GarageCars 10216.91 2985.56 3.42 0.00064 ***
## GarageArea 12.89 10.12 1.27 0.20310
## WoodDeckSF 31.75 8.18 3.88 0.00011 ***
## OpenPorchSF 6.31 15.77 0.40 0.68898
## YearBuilt.m -191.59 49.72 -3.85 0.00012 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36700 on 1444 degrees of freedom
## Multiple R-squared: 0.789, Adjusted R-squared: 0.786
## F-statistic: 359 on 15 and 1444 DF, p-value: <0.0000000000000002
plot(train.model)
par(mfrow = c(2,3))
X1 <- train.frame$OverallQual
X2 <- train.frame$GrLivArea
X3 <- train.frame$TotalBsmtSF
X4 <- train.frame$GarageArea
X5 <- train.frame$TotRmsAbvGrd
Y1 <- train.frame$SalePrice
plot(X1,Y1, main = "OverAll Quality", ylab = "Sale Price")
abline (lm(Y1~X1),col ="Red",lwd =3)
plot(X2,Y1, main = "Ground Living Area", ylab = "Sale Price")
abline (lm(Y1~X2),col ="Red",lwd =3)
plot(X3,Y1, main = "Total Basement SF", ylab = "Sale Price")
abline (lm(Y1~X3),col ="Red",lwd =3)
plot(X4,Y1, main = "Garage Area", ylab = "Sale Price")
abline (lm(Y1~X4),col ="Red",lwd =3)
plot(X5,Y1, main = "Total Rooms AbvGrd", ylab = "Sale Price")
abline (lm(Y1~X5),col ="Red",lwd =3)
train.frame <- train.frame%>%
dplyr::select(OverallQual,GrLivArea,TotalBsmtSF,GarageArea,TotRmsAbvGrd, SalePrice)
train.model.best <- lm(SalePrice ~ .,data = train.frame)
ggpairs(
train.frame,
lower = list(continuous = ggally_points, combo = ggally_dot_no_facet)
)
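Before scoring the held-out test set, a quick hedged sketch (an addition) checks the reduced model's in-sample fit. Note that this is optimistic compared with the Kaggle leaderboard, which scores the RMSE of logged sale prices on unseen data:
pred.in <- predict(train.model.best, train.frame)
sqrt(mean((pred.in - train.frame$SalePrice)^2))   # in-sample RMSE in dollars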
path2 = "https://raw.githubusercontent.com/kelloggjohnd/Data605/master/test.csv"
test.set<- read_csv(file = path2)
## Parsed with column specification:
## cols(
## .default = col_character(),
## Id = col_double(),
## MSSubClass = col_double(),
## LotFrontage = col_double(),
## LotArea = col_double(),
## OverallQual = col_double(),
## OverallCond = col_double(),
## YearBuilt = col_double(),
## YearRemodAdd = col_double(),
## MasVnrArea = col_double(),
## BsmtFinSF1 = col_double(),
## BsmtFinSF2 = col_double(),
## BsmtUnfSF = col_double(),
## TotalBsmtSF = col_double(),
## `1stFlrSF` = col_double(),
## `2ndFlrSF` = col_double(),
## LowQualFinSF = col_double(),
## GrLivArea = col_double(),
## BsmtFullBath = col_double(),
## BsmtHalfBath = col_double(),
## FullBath = col_double()
## # ... with 17 more columns
## )
## See spec(...) for full column specifications.
test.set <- test.set %>%
dplyr::select("Id","MSSubClass","LotArea","OverallQual","OverallCond","YearBuilt","YearRemodAdd","BsmtFinSF1","BsmtUnfSF","TotalBsmtSF","1stFlrSF","2ndFlrSF","LowQualFinSF","GrLivArea","BsmtFullBath","BsmtHalfBath","FullBath","HalfBath","BedroomAbvGr","KitchenAbvGr","TotRmsAbvGrd","Fireplaces","GarageYrBlt","GarageCars","GarageArea","WoodDeckSF","OpenPorchSF","EnclosedPorch","3SsnPorch","ScreenPorch","PoolArea","MiscVal","MoSold","YrSold")%>%
replace(is.na(.),0)
# good.corr$Vname[1:15] drops SalePrice (the last entry), which is absent from the test set
Testframe <- dplyr::select(test.set, good.corr$Vname[1:15])%>%
mutate(YearBuilt.m = 2017-YearBuilt)%>%    # same age transformation applied to the training set
dplyr::select(-YearBuilt)
# model.matrix builds a frame holding only the five predictors kept in train.model.best
Test.model <- model.matrix(YearBuilt.m ~ OverallQual+ GrLivArea+ TotalBsmtSF+ GarageArea+ TotRmsAbvGrd, Testframe)%>%
as.data.frame%>%
dplyr::select(-'(Intercept)')
sale.prediction <- predict.lm(train.model.best,Test.model)
sale.prediction <- as.data.frame(sale.prediction)
Product <- data.frame(ID = test.set$Id, SalePrice = sale.prediction$sale.prediction)
head(Product)
## ID SalePrice
## 1 1461 134881
## 2 1462 168721
## 3 1463 155533
## 4 1464 178303
## 5 1465 226404
## 6 1466 174023
write_csv(Product, 'submission.csv')
Kaggle username: johnkellogg
Kaggle score: 0.65805