library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.5
## corrplot 0.84 loaded
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Draws 10000 random values from a uniform distribution between 1 and n.
#
# n: upper bound of the distribution; a single number greater than or equal to 6.
# Returns a numeric vector of length 10000.
# Signals an error when n is not a single number >= 6, so an invalid n can
# never be mistaken for a sample by the calling code.
X = function(n){
  if (!is.numeric(n) || length(n) != 1 || is.na(n) || n < 6){
    stop("Input a number greater or equal to 6", call. = FALSE)
  }
  runif(10000, min = 1, max = n)
}
# Draws 10000 observations from a normal distribution whose mean and
# standard deviation are both equal to (n+1)/2.
#
# n: number used to derive the shared mean / standard deviation.
# Returns a numeric vector of length 10000.
Y = function(n){
  centre = (n + 1) / 2
  draws = rnorm(10000, mean = centre, sd = centre)
  draws
}
# Part a: P(X>x | X>y)
# Simulate both variables with N = 6.
N = 6
val.X = X(N)
val.Y = Y(N)
# Thresholds: x is the median of X, y is the first quartile of Y.
x = median(val.X)
y = quantile(val.Y, probs = 0.25)
# Draws of X above each threshold.
X.greater_x = val.X[val.X > x]
X.greater_y = val.X[val.X > y]
# Draws of X above both thresholds, i.e. the event (X>x n X>y).
p.anb = val.X[(val.X > x) & (val.X > y)]
# Conditional probability: share of the X>y draws that are also above x.
length(p.anb) / length(X.greater_y)
## [1] 0.5138218
# Part b: P(X>x n Y>y)
# Draws of Y above the threshold y.
Y.greater_y = val.Y[val.Y > y]
# Marginal probabilities P(X>x) and P(Y>y), estimated as sample proportions.
p.x = length(X.greater_x) / length(val.X)
p.y = length(Y.greater_y) / length(val.Y)
# X and Y are treated as independent, so the joint probability is the product
# of the marginals: P(X>x n Y>y) = P(X>x)*P(Y>y).
p.x * p.y
## [1] 0.375
# Part c: P(X<x | X>y)
# Draws of X that are above y and below x, i.e. the event (X<x n X>y).
p = val.X[(val.X > y) & (val.X < x)]
# Conditional probability: share of the X>y draws that are also below x.
length(p) / length(X.greater_y)
## [1] 0.4861782
# Investigate whether P(X>x n Y>y) = P(X>x)*P(Y>y) (independence).
# Each cell is the share of the simulated (X, Y) pairs that fall in it.
# row1 holds the Y<=y cells and row2 the Y>y cells, each ordered X<=x then X>x.
row1 = c(mean(val.X<=x & val.Y<=y), mean(val.X>x & val.Y<=y))
row2 = c(mean(val.X<=x & val.Y>y), mean(val.X>x & val.Y>y))
# matrix() fills column by column by default, which would put the X categories
# on the rows and the Y categories on the columns - the opposite of the labels
# applied below. byrow = TRUE keeps row1/row2 as the actual rows of the table.
table = matrix(c(row1,row2), nrow=2, byrow=TRUE)
# append the column totals as a third row, then the row totals as a last column
table1 = rbind(table, colSums(table))
table1 = cbind(table1, rowSums(table1))
row_names = c('P(Y<=y)','P(Y>y)','Total')
marginal_prob = data.frame(row_names,table1)
names(marginal_prob) = c('X/Y','P(X<=x)','P(X>x)','Total')
marginal_prob
## X/Y P(X<=x) P(X>x) Total
## 1 P(Y<=y) 0.1267 0.1233 0.25
## 2 P(Y>y) 0.3733 0.3767 0.75
## 3 Total 0.5000 0.5000 1.00
# Joint probability P(X>x n Y>y): second row, 'P(X>x)' column of the table.
marginal_prob[2, 'P(X>x)']
## [1] 0.3767
# Product of the two marginal totals, P(X>x) * P(Y>y).
marginal_prob[3, 'P(X>x)'] * marginal_prob[2, 'Total']
## [1] 0.375
# The joint probability is approximately equal to the product of the marginals,
# so I conclude that X and Y are independent.
# Fisher's exact test on the cell counts (proportions scaled back up by 10000).
new_table = 10000 * table
fisher.test(new_table)
##
## Fisher's Exact Test for Count Data
##
## data: new_table
## p-value = 0.446
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9461832 1.1363998
## sample estimates:
## odds ratio
## 1.036945
# Pearson's chi-squared test on the same 2x2 table of counts
chisq.test(x = new_table)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: new_table
## X-squared = 0.5808, df = 1, p-value = 0.446
# Fisher's exact test is mainly intended for small samples, while the chi-squared test is an approximation suited to larger samples. Since our sample consists of 10000 simulated observations, the chi-squared test is more appropriate.
# Both tests give a large p-value (0.446), so we fail to reject the null hypothesis that X and Y are independent.
# Load the training data and the evaluation data from GitHub.
prices.data = read.csv(file = 'https://raw.githubusercontent.com/schoolkidrich/CUNY_MSDS/main/DATA_605/housing_prices/train.csv')
prices.eval = read.csv(file = 'https://raw.githubusercontent.com/schoolkidrich/CUNY_MSDS/main/DATA_605/housing_prices/test.csv')
# Preview the first rows of the training data.
head(x = prices.data)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1 1 60 RL 65 8450 Pave <NA> Reg Lvl
## 2 2 20 RL 80 9600 Pave <NA> Reg Lvl
## 3 3 60 RL 68 11250 Pave <NA> IR1 Lvl
## 4 4 70 RL 60 9550 Pave <NA> IR1 Lvl
## 5 5 60 RL 84 14260 Pave <NA> IR1 Lvl
## 6 6 50 RL 85 14115 Pave <NA> IR1 Lvl
## Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 2 AllPub FR2 Gtl Veenker Feedr Norm 1Fam
## 3 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 4 AllPub Corner Gtl Crawfor Norm Norm 1Fam
## 5 AllPub FR2 Gtl NoRidge Norm Norm 1Fam
## 6 AllPub Inside Gtl Mitchel Norm Norm 1Fam
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1 2Story 7 5 2003 2003 Gable CompShg
## 2 1Story 6 8 1976 1976 Gable CompShg
## 3 2Story 7 5 2001 2002 Gable CompShg
## 4 2Story 7 5 1915 1970 Gable CompShg
## 5 2Story 8 5 2000 2000 Gable CompShg
## 6 1.5Fin 5 5 1993 1995 Gable CompShg
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1 VinylSd VinylSd BrkFace 196 Gd TA PConc
## 2 MetalSd MetalSd None 0 TA TA CBlock
## 3 VinylSd VinylSd BrkFace 162 Gd TA PConc
## 4 Wd Sdng Wd Shng None 0 TA TA BrkTil
## 5 VinylSd VinylSd BrkFace 350 Gd TA PConc
## 6 VinylSd VinylSd None 0 TA TA Wood
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1 Gd TA No GLQ 706 Unf
## 2 Gd TA Gd ALQ 978 Unf
## 3 Gd TA Mn GLQ 486 Unf
## 4 TA Gd No ALQ 216 Unf
## 5 Gd TA Av GLQ 655 Unf
## 6 Gd TA No GLQ 732 Unf
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1 0 150 856 GasA Ex Y SBrkr
## 2 0 284 1262 GasA Ex Y SBrkr
## 3 0 434 920 GasA Ex Y SBrkr
## 4 0 540 756 GasA Gd Y SBrkr
## 5 0 490 1145 GasA Ex Y SBrkr
## 6 0 64 796 GasA Ex Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1 856 854 0 1710 1 0 2
## 2 1262 0 0 1262 0 1 2
## 3 920 866 0 1786 1 0 2
## 4 961 756 0 1717 1 0 1
## 5 1145 1053 0 2198 1 0 2
## 6 796 566 0 1362 1 0 1
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1 1 3 1 Gd 8 Typ
## 2 0 3 1 TA 6 Typ
## 3 1 3 1 Gd 6 Typ
## 4 0 3 1 Gd 7 Typ
## 5 1 4 1 Gd 9 Typ
## 6 1 1 1 TA 5 Typ
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1 0 <NA> Attchd 2003 RFn 2
## 2 1 TA Attchd 1976 RFn 2
## 3 1 TA Attchd 2001 RFn 2
## 4 1 Gd Detchd 1998 Unf 3
## 5 1 TA Attchd 2000 RFn 3
## 6 0 <NA> Attchd 1993 Unf 2
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1 548 TA TA Y 0 61
## 2 460 TA TA Y 298 0
## 3 608 TA TA Y 0 42
## 4 642 TA TA Y 0 35
## 5 836 TA TA Y 192 84
## 6 480 TA TA Y 40 30
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1 0 0 0 0 <NA> <NA> <NA>
## 2 0 0 0 0 <NA> <NA> <NA>
## 3 0 0 0 0 <NA> <NA> <NA>
## 4 272 0 0 0 <NA> <NA> <NA>
## 5 0 0 0 0 <NA> <NA> <NA>
## 6 0 320 0 0 <NA> MnPrv Shed
## MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1 0 2 2008 WD Normal 208500
## 2 0 5 2007 WD Normal 181500
## 3 0 9 2008 WD Normal 223500
## 4 0 2 2006 WD Abnorml 140000
## 5 0 12 2008 WD Normal 250000
## 6 700 10 2009 WD Normal 143000
# Per-column summary statistics of the training data.
summary(object = prices.data)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
# Scatter plot of sale price against living area (GrLivArea) with a fitted
# linear regression line.
ggplot(prices.data, aes(x = GrLivArea, y = SalePrice)) +
  geom_point() +
  labs(title = 'Living Area vs Sale Price', x = 'Living Area', y = 'Sale Price') +
  geom_smooth(method = 'lm', formula = y ~ x)
# Scatter plot of sale price against garage area (GarageArea) with a fitted
# linear regression line.
ggplot(prices.data, aes(x = GarageArea, y = SalePrice)) +
  geom_point() +
  labs(title = 'Garage Size vs Sale Price', x = 'Garage Size', y = 'Sale Price') +
  geom_smooth(method = 'lm', formula = y ~ x)
# Quantitative variables to compare.
variables = c('GrLivArea', 'GarageArea', 'SalePrice')
# Pairwise correlation matrix of those three columns.
cor_matrix = cor(x = prices.data[, variables])
cor_matrix
## GrLivArea GarageArea SalePrice
## GrLivArea 1.0000000 0.4689975 0.7086245
## GarageArea 0.4689975 1.0000000 0.6234314
## SalePrice 0.7086245 0.6234314 1.0000000
# Correlation test between living area and garage area, reporting an 80% confidence interval
cor.test(x = prices.data$GrLivArea, y = prices.data$GarageArea, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: prices.data$GrLivArea and prices.data$GarageArea
## t = 20.276, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4423993 0.4947713
## sample estimates:
## cor
## 0.4689975
# Correlation test between living area and sale price, reporting an 80% confidence interval
cor.test(x = prices.data$GrLivArea, y = prices.data$SalePrice, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: prices.data$GrLivArea and prices.data$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6915087 0.7249450
## sample estimates:
## cor
## 0.7086245
# Correlation test between garage area and sale price, reporting an 80% confidence interval
cor.test(x = prices.data$GarageArea, y = prices.data$SalePrice, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: prices.data$GarageArea and prices.data$SalePrice
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6024756 0.6435283
## sample estimates:
## cor
## 0.6234314
# Pairwise correlation tests of whether the correlation between two variables is 0. In all three tests, 0 lies outside the 80% confidence interval (and the p-value is below 2.2e-16), so we reject the null hypothesis of zero correlation.
# We should be worried about family-wise error: there are many variables in this dataset and, if we conducted a hypothesis test for each pair of variables, the chance of at least one false positive would compound quickly.
# Invert the correlation matrix to obtain the precision matrix.
inverse_cor = solve(a = cor_matrix)
# Precision matrix times correlation matrix, rounded to whole numbers.
identity_1 = round(inverse_cor %*% cor_matrix, digits = 0)
# Correlation matrix times precision matrix, rounded to whole numbers.
identity_2 = round(cor_matrix %*% inverse_cor, digits = 0)
# Element-wise check that both products agree.
identity_1 == identity_2
## GrLivArea GarageArea SalePrice
## GrLivArea TRUE TRUE TRUE
## GarageArea TRUE TRUE TRUE
## SalePrice TRUE TRUE TRUE
# Multiplying a matrix by its inverse (in either order) gives the identity matrix.
identity_1
## GrLivArea GarageArea SalePrice
## GrLivArea 1 0 0
## GarageArea 0 1 0
## SalePrice 0 0 1
# Performs an LU decomposition (unit-diagonal L, no pivoting) of a square matrix.
#
# m: a square numeric matrix.
# Returns a list with L (lower-triangular matrix with ones on the diagonal) and
# U (upper-triangular matrix) such that L %*% U reproduces m. L and U carry no
# dimnames.
# Signals an error if m is not square, or if a zero pivot is met, because the
# elimination cannot continue without row exchanges in that case.
LU_decomp = function(m){
  m = as.matrix(m)
  count = nrow(m)
  if (count != ncol(m)){
    stop("LU_decomp needs a square matrix", call. = FALSE)
  }
  # L starts as the identity (unit diagonal); U starts as all zeros
  L = diag(count)
  U = matrix(0, nrow=count, ncol=count)
  for(i in seq_len(count)){
    pivot = m[i,i]
    if (isTRUE(pivot == 0)){
      stop("zero pivot encountered; LU decomposition without pivoting is not possible", call. = FALSE)
    }
    U[i,i] = pivot
    # indices i+1..count; empty on the last step, so the loops below are skipped
    rest = seq_len(count)[-seq_len(i)]
    for(j in rest){
      # multipliers are taken from column i of m, the row of U from row i of m
      L[j,i] = m[j,i]/pivot
      U[i,j] = m[i,j]
    }
    # eliminate column i from the remaining lower-right submatrix
    for(j in rest){
      for(k in rest){
        m[j,k] = m[j,k]-L[j,i]*U[i,k]
      }
    }
  }
  return(list(L=L,U=U))
}
# LU decomposition of the correlation matrix
LU_decomp(m = cor_matrix)
## $L
## [,1] [,2] [,3]
## [1,] 1.0000000 0.0000000 0
## [2,] 0.4689975 1.0000000 0
## [3,] 0.7086245 0.3731704 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1 0.4689975 0.7086245
## [2,] 0 0.7800414 0.2910883
## [3,] 0 0.0000000 0.3892258
# Fit an exponential distribution to the sale prices with the fitdistr()
# function from the MASS package.
price = prices.data$SalePrice
fit = fitdistr(price, densfun = 'exponential')
# draw 1000 simulated values from an exponential with the fitted estimate as rate
sample_fit=rexp(1000, fit$estimate)
# show the two histograms side by side
par(mfrow=c(1,2))
# sale price is somewhat skewed to the right
hist(price, xlab='Sale Price', main='Observation')
# simulated
hist(sample_fit, xlab = 'Sale Price', main='Simulation')
# 5th and 95th percentiles of the simulated exponential sample
quantile(sample_fit, probs = c(0.05, 0.95))
## 5% 95%
## 11128.84 537083.29
# 95% confidence interval for the mean sale price, assuming normality:
# mean +/- z * (sd / sqrt(n)) with z = 1.96
z = 1.96
n = length(price)
m = mean(price)
sd = sd(price)
ci = m + c(-1, 1) * z * (sd / sqrt(n))
ci
## [1] 176846.1 184996.2
# 5th and 95th percentiles of the observed sale prices
quantile(price, probs = c(0.05, 0.95))
## 5% 95%
## 88000 326100
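# In the run shown above, the simulated exponential data gives a much wider 5%-95% range (about 11,129 to 537,083) than the observed prices (88,000 to 326,100), so it overestimates the spread of the observed prices, while the interval computed under the assumption of normality (about 176,846 to 184,996) is far narrower and underestimates it. Note that the latter is a confidence interval for the mean, not a range for individual prices. This shows that the data is not completely right skewed (exponential) but also not completely normal.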
# 70/30 train-test split of the training data
size = nrow(prices.data)
# fixed seed so the same rows are drawn on every run
set.seed(1111)
training = sample(seq_len(size), size = round(0.7 * size))
prices.train = prices.data[training, ]
prices.test = prices.data[-training, ]
# linear model of SalePrice on Neighborhood, GrLivArea, GarageArea, MSSubClass and YearBuilt
prices.model = lm(
  SalePrice ~ Neighborhood + GrLivArea + GarageArea + MSSubClass + YearBuilt,
  data = prices.train
)
summary(prices.model)
##
## Call:
## lm(formula = SalePrice ~ Neighborhood + GrLivArea + GarageArea +
## MSSubClass + YearBuilt, data = prices.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -364914 -17450 -1908 13842 281669
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.332e+06 1.696e+05 -7.855 1.03e-14 ***
## NeighborhoodBlueste -2.100e+04 3.947e+04 -0.532 0.594771
## NeighborhoodBrDale -2.667e+04 1.642e+04 -1.625 0.104543
## NeighborhoodBrkSide -1.718e+04 1.431e+04 -1.200 0.230257
## NeighborhoodClearCr -1.133e+03 1.472e+04 -0.077 0.938685
## NeighborhoodCollgCr -2.008e+04 1.217e+04 -1.650 0.099172 .
## NeighborhoodCrawfor 1.870e+04 1.444e+04 1.295 0.195584
## NeighborhoodEdwards -3.718e+04 1.301e+04 -2.858 0.004352 **
## NeighborhoodGilbert -2.903e+04 1.254e+04 -2.315 0.020790 *
## NeighborhoodIDOTRR -2.840e+04 1.536e+04 -1.849 0.064790 .
## NeighborhoodMeadowV -2.234e+04 1.534e+04 -1.456 0.145580
## NeighborhoodMitchel -3.028e+04 1.320e+04 -2.294 0.022001 *
## NeighborhoodNAmes -3.093e+04 1.262e+04 -2.450 0.014443 *
## NeighborhoodNoRidge 4.340e+04 1.376e+04 3.155 0.001656 **
## NeighborhoodNPkVill -6.915e+03 1.933e+04 -0.358 0.720542
## NeighborhoodNridgHt 4.698e+04 1.283e+04 3.662 0.000263 ***
## NeighborhoodNWAmes -3.237e+04 1.304e+04 -2.483 0.013176 *
## NeighborhoodOldTown -2.042e+04 1.419e+04 -1.439 0.150418
## NeighborhoodSawyer -3.035e+04 1.320e+04 -2.299 0.021714 *
## NeighborhoodSawyerW -2.702e+04 1.287e+04 -2.099 0.036071 *
## NeighborhoodSomerst -9.761e+00 1.237e+04 -0.001 0.999371
## NeighborhoodStoneBr 7.051e+04 1.438e+04 4.905 1.09e-06 ***
## NeighborhoodSWISU -2.363e+04 1.583e+04 -1.493 0.135722
## NeighborhoodTimber -1.884e+03 1.411e+04 -0.134 0.893786
## NeighborhoodVeenker 4.677e+04 1.712e+04 2.732 0.006406 **
## GrLivArea 7.312e+01 2.882e+00 25.371 < 2e-16 ***
## GarageArea 4.296e+01 7.427e+00 5.784 9.75e-09 ***
## MSSubClass -2.976e+02 3.322e+01 -8.960 < 2e-16 ***
## YearBuilt 7.163e+02 8.450e+01 8.478 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37700 on 993 degrees of freedom
## Multiple R-squared: 0.7727, Adjusted R-squared: 0.7663
## F-statistic: 120.6 on 28 and 993 DF, p-value: < 2.2e-16
# Predict sale prices for the held-out test rows.
predictions = predict(prices.model, newdata = prices.test)
# Squared correlation between predicted and actual prices (R^2 on the test set).
cor(x = predictions, y = prices.test$SalePrice)^2
## [1] 0.795895
# Replace missing GarageArea values in the evaluation set with 0.
prices.eval$GarageArea = replace_na(prices.eval$GarageArea, 0)
# Predicted sale prices for the evaluation set.
prices.eval$SalePrice = predict(prices.model, newdata = prices.eval)
# Keep only the two columns used in the Kaggle submission file.
submission = prices.eval[, c('Id', 'SalePrice')]
head(submission)
## Id SalePrice
## 1 1461 132807.5
## 2 1462 144360.8
## 3 1463 191540.8
## 4 1464 189913.6
## 5 1465 245150.2
## 6 1466 188772.1
# Write the submission to disk without row names.
write.csv(x = submission, file = "submission.csv", row.names = FALSE)
#kaggle user: schoolboyrich
#kaggle score: 0.20458