library(psych)
## Warning: package 'psych' was built under R version 4.0.5
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.5
## corrplot 0.88 loaded
library(matrixcalc)
library(MASS)
## Warning: package 'MASS' was built under R version 4.0.4
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.0.5
# Simulation set-up: n draws of x from Uniform(1, N) and n draws of y from a
# normal distribution whose mean and sd are both (N + 1) / 2.
n <- 10000
N <- 6
centre <- (N + 1) / 2
x <- runif(n, min = 1, max = N)
y <- rnorm(n, mean = centre, sd = centre)
prob_table <- data.frame(x, y)

# Cut-offs used below: the median of x and the first quartile of y.
x_median <- median(prob_table$x)
y_quartile <- quantile(prob_table$y, .25)

# Row-wise logical masks over prob_table.
x_gt_median <- prob_table$x > x_median
x_lt_median <- prob_table$x < x_median
x_gt_quartile <- prob_table$x > y_quartile # x (not y) compared with y's quartile
y_gt_quartile <- prob_table$y > y_quartile

# 1a: share of rows with x > x_median among the rows with x > y_quartile.
p_a <- sum(x_gt_quartile & x_gt_median) / sum(x_gt_quartile)
print(paste("the probability for question 1a is ", p_a))
## [1] "the probability for question 1a is 0.513980263157895"

# 1b: share of all n rows with x > x_median and y > y_quartile.
p_b <- sum(x_gt_median & y_gt_quartile) / n
print(paste("the probability for question 1b is ", p_b))
## [1] "the probability for question 1b is 0.3772"

# 1c: share of rows with x < x_median among the rows with x > y_quartile.
p_c <- sum(x_lt_median & x_gt_quartile) / sum(x_gt_quartile)
print(paste("the probability for question 1c is ", p_c))
## [1] "the probability for question 1c is 0.486019736842105"
# Joint relative frequencies of the four cells formed by splitting x at its
# median and y at its first quartile.
x_above <- prob_table$x > x_median
x_not_above <- prob_table$x <= x_median
y_above <- prob_table$y > y_quartile
y_not_above <- prob_table$y <= y_quartile
probs_v <- c(
  sum(x_above & y_above),
  sum(x_not_above & y_above),
  sum(x_above & y_not_above),
  sum(x_not_above & y_not_above)
) / n

# matrix() fills column by column, so rows index the x split and columns the
# y split.
prob_mat <- matrix(
  probs_v,
  nrow = 2,
  ncol = 2,
  dimnames = list(
    c("x>x_median", "x<=x_median"),
    c("y>y_quartile", "y<=y_quartile")
  )
)
print(prob_mat)
## y>y_quartile y<=y_quartile
## x>x_median 0.3772 0.1228
## x<=x_median 0.3728 0.1272

# Independence check: compare the joint frequency of (x > x_median,
# y > y_quartile) with the product of the two marginal frequencies, both
# rounded to two decimals.
joint_both_above <- prob_mat[1]
product_of_marginals <- (sum(x_above) / n) * (sum(y_above) / n)
print(identical(round(joint_both_above, 2), round(product_of_marginals, 2)))
## [1] TRUE
# Cell counts for the same 2 x 2 split (x at its median, y at its first
# quartile), in the order used to fill the matrix column by column.
x_hi <- prob_table$x > x_median
x_lo <- prob_table$x <= x_median
y_hi <- prob_table$y > y_quartile
y_lo <- prob_table$y <= y_quartile
count_v <- c(sum(x_hi & y_hi), sum(x_lo & y_hi), sum(x_hi & y_lo), sum(x_lo & y_lo))
print(count_v)
## [1] 3772 3728 1228 1272

# Both tests run on the same 2 x 2 contingency table; only the p-values are
# reported.
count_table <- matrix(count_v, 2, 2)
print(chisq.test(count_table)$p.value)
## [1] 0.3206893
# NOTE(review): simulate.p.value may have no effect on a 2 x 2 table -
# confirm against ?fisher.test.
print(fisher.test(count_table, simulate.p.value = TRUE)$p.value)
## [1] 0.3206894
The two p-values are nearly identical. Because neither is below .05, we fail to reject the null hypothesis of independence; the data are consistent with the two events being independent.
The chi-squared test relies on a large-sample approximation, whereas Fisher’s exact test computes an exact p-value and is intended mainly for small samples. With 10,000 observations, the chi-squared test is the more appropriate choice here.
# Column-by-column summary of the training data.
# NOTE(review): train_set is not created in the visible part of this script -
# confirm it is loaded before this point.
train_summary <- summary(train_set)
print(train_summary)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
# List the column names of the training data.
print(names(train_set))
## [1] "Id" "MSSubClass" "MSZoning" "LotFrontage"
## [5] "LotArea" "Street" "Alley" "LotShape"
## [9] "LandContour" "Utilities" "LotConfig" "LandSlope"
## [13] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [17] "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
## [21] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior1st"
## [25] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
## [29] "ExterCond" "Foundation" "BsmtQual" "BsmtCond"
## [33] "BsmtExposure" "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2"
## [37] "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "Heating"
## [41] "HeatingQC" "CentralAir" "Electrical" "X1stFlrSF"
## [45] "X2ndFlrSF" "LowQualFinSF" "GrLivArea" "BsmtFullBath"
## [49] "BsmtHalfBath" "FullBath" "HalfBath" "BedroomAbvGr"
## [53] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Functional"
## [57] "Fireplaces" "FireplaceQu" "GarageType" "GarageYrBlt"
## [61] "GarageFinish" "GarageCars" "GarageArea" "GarageQual"
## [65] "GarageCond" "PavedDrive" "WoodDeckSF" "OpenPorchSF"
## [69] "EnclosedPorch" "X3SsnPorch" "ScreenPorch" "PoolArea"
## [73] "PoolQC" "Fence" "MiscFeature" "MiscVal"
## [77] "MoSold" "YrSold" "SaleType" "SaleCondition"
## [81] "SalePrice"
# Variables studied below: SalePrice as the dependent variable, LotArea and
# GarageArea as the two independent variables.
dep_var <- train_set[["SalePrice"]]
ind_var1 <- train_set[["LotArea"]]
ind_var2 <- train_set[["GarageArea"]]

# Five-number summaries (plus mean) from base R.
print(summary(dep_var))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
print(summary(ind_var1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1300 7554 9478 10517 11602 215245
print(summary(ind_var2))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 334.5 480.0 473.0 576.0 1418.0

# Extended descriptives via describe() (sd, skew, kurtosis, se, ...).
print(describe(dep_var))
## vars n mean sd median trimmed mad min max range skew
## X1 1 1460 180921.2 79442.5 163000 170783.3 56338.8 34900 755000 720100 1.88
## kurtosis se
## X1 6.5 2079.11
print(describe(ind_var1))
## vars n mean sd median trimmed mad min max range skew
## X1 1 1460 10516.83 9981.26 9478.5 9563.28 2962.23 1300 215245 213945 12.18
## kurtosis se
## X1 202.26 261.22
print(describe(ind_var2))
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 1460 472.98 213.8 480 469.81 177.91 0 1418 1418 0.18 0.9
## se
## X1 5.6
# Exploratory plots for the dependent variable (SalePrice) and the two
# independent variables (LotArea, GarageArea).

# Index plots: each value against its row position, one plot per page.
par(mfrow = c(1, 1))
plot(dep_var, ylab = "SalePrice", main = "SalePrice by row index")
plot(ind_var1, ylab = "LotArea", main = "LotArea by row index")
plot(ind_var2, ylab = "GarageArea", main = "GarageArea by row index")

# Box plot and histogram side by side for each variable.
par(mfrow = c(1, 2))
boxplot(dep_var, main = "SalesPrice BoxPlot")
hist(dep_var, breaks = 20, main = "SalesPrice Histogram")
boxplot(ind_var1, main = "LotArea BoxPlot")
hist(ind_var1, breaks = 20, main = "LotArea Histogram")
boxplot(ind_var2, main = "GarageArea BoxPlot")
hist(ind_var2, breaks = 20, main = "GarageArea Histogram")

# Scatterplots. In a plot formula the left-hand side goes on the vertical
# axis, so the dependent variable is written on the left; the previous
# `ind_var ~ dep_var` form drew the response on the horizontal axis.
plot(dep_var ~ ind_var1,
  xlab = "LotArea", ylab = "SalePrice", main = "SalePrice vs LotArea"
)
plot(dep_var ~ ind_var2,
  xlab = "GarageArea", ylab = "SalePrice", main = "SalePrice vs GarageArea"
)
# Pairwise correlations between SalePrice, LotArea and GarageArea, computed
# once and reused for both the printed matrix and the plot.
pair_cols <- c("SalePrice", "LotArea", "GarageArea")
pair_cor <- cor(train_set[, pair_cols])
pair_cor
## SalePrice LotArea GarageArea
## SalePrice 1.0000000 0.2638434 0.6234314
## LotArea 0.2638434 1.0000000 0.1804028
## GarageArea 0.6234314 0.1804028 1.0000000

# Back to a single plotting panel, then draw the matrix as circles.
par(mfrow = c(1, 1))
corrplot(pair_cor, method = "circle")
# Pearson correlation tests for each pair among SalePrice, LotArea and
# GarageArea. conf.level = 0.80 requests an 80 percent confidence interval
# for the correlation, as echoed in each result below.

# Pair 1: SalePrice vs LotArea.
cor.test(train_set$SalePrice,train_set$LotArea, method = 'pearson', conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train_set$SalePrice and train_set$LotArea
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2323391 0.2947946
## sample estimates:
## cor
## 0.2638434
# Pair 2: SalePrice vs GarageArea.
cor.test(train_set$SalePrice,train_set$GarageArea, method = 'pearson', conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train_set$SalePrice and train_set$GarageArea
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6024756 0.6435283
## sample estimates:
## cor
## 0.6234314
# Pair 3: LotArea vs GarageArea (the two independent variables).
cor.test(train_set$LotArea,train_set$GarageArea, method = 'pearson', conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train_set$LotArea and train_set$GarageArea
## t = 7.0034, df = 1458, p-value = 3.803e-12
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.1477356 0.2126767
## sample estimates:
## cor
## 0.1804028
All three p-values are far below any conventional significance level, so we reject the null hypothesis for each pair and conclude that the true correlation is not equal to 0. Familywise error is still a concern: every additional pairwise test adds another chance of a false positive, and because correlations will most likely exist across the dataset, testing many pairs can lead us to infer relationships that do not actually exist.
# Linear algebra on the 3 x 3 correlation matrix: invert it (the inverse is
# the precision matrix), multiply the two in both orders, and run an LU
# decomposition.
cormat <- cor(train_set[, c("SalePrice", "LotArea", "GarageArea")])
inversecormat <- solve(cormat)
print(cormat)
## SalePrice LotArea GarageArea
## SalePrice 1.0000000 0.2638434 0.6234314
## LotArea 0.2638434 1.0000000 0.1804028
## GarageArea 0.6234314 0.1804028 1.0000000
print(inversecormat)
## SalePrice LotArea GarageArea
## SalePrice 1.7016986 -0.26625940 -1.01285847
## LotArea -0.2662594 1.07530074 -0.02799273
## GarageArea -1.0128585 -0.02799273 1.63649778

# Both products should equal the identity matrix up to rounding error.
print(cormat %*% inversecormat)
## SalePrice LotArea GarageArea
## SalePrice 1 2.428613e-17 0
## LotArea 0 1.000000e+00 0
## GarageArea 0 3.469447e-17 1
print(inversecormat %*% cormat)
## SalePrice LotArea GarageArea
## SalePrice 1.000000e+00 -5.551115e-17 0.000000e+00
## LotArea 7.979728e-17 1.000000e+00 6.591949e-17
## GarageArea 0.000000e+00 0.000000e+00 1.000000e+00

# Decompose each matrix once and reuse the factors instead of recomputing
# the decomposition for every print.
lu_cormat <- matrixcalc::lu.decomposition(cormat)
lu_inverse <- matrixcalc::lu.decomposition(inversecormat)
print(lu_cormat$L)
## [,1] [,2] [,3]
## [1,] 1.0000000 0.00000000 0
## [2,] 0.2638434 1.00000000 0
## [3,] 0.6234314 0.01710527 1
print(lu_cormat$U)
## [,1] [,2] [,3]
## [1,] 1 0.2638434 0.62343144
## [2,] 0 0.9303867 0.01591451
## [3,] 0 0.0000000 0.61106102

# Check that L %*% U reproduces the original matrix. The comparison is made
# element by element within a small tolerance, because exact `==` on doubles
# can report FALSE for products that differ only by rounding error.
lu_tolerance <- sqrt(.Machine$double.eps)
print(abs(lu_cormat$L %*% lu_cormat$U - cormat) < lu_tolerance)
## SalePrice LotArea GarageArea
## SalePrice TRUE TRUE TRUE
## LotArea TRUE TRUE TRUE
## GarageArea TRUE TRUE TRUE
print(lu_inverse$L %*% lu_inverse$U)
## [,1] [,2] [,3]
## [1,] 1.7016986 -0.26625940 -1.01285847
## [2,] -0.2662594 1.07530074 -0.02799273
## [3,] -1.0128585 -0.02799273 1.63649778
# Confirm ind_var2 has no negative values before fitting an exponential
# distribution to it.
has_negative <- min(ind_var2) < 0
paste0("minimum value is below 0: ", has_negative)
## [1] "minimum value is below 0: FALSE"

# Fit the exponential distribution, then draw 1000 values using the fitted
# estimate as the rate. No seed is set in this script, so the sample (and the
# percentiles echoed below) differ between runs.
exp_fit_dist_cacl <- MASS::fitdistr(ind_var2, densfun = "exponential")
exp_sample <- rexp(n = 1000, rate = exp_fit_dist_cacl$estimate)

# Observed data and simulated sample side by side.
par(mfrow = c(1, 2))
hist(ind_var2)
hist(exp_sample)

# 5th and 95th percentiles of the simulated sample.
quantile(exp_sample, probs = c(0.05, 0.95))
## 5% 95%
## 21.09845 1472.69634
# Normal-approximation 95% confidence interval for the mean of ind_var2.
mu_indvar2 <- mean(ind_var2)
sigma_indvar2 <- sd(ind_var2)
count_indvar2 <- length(ind_var2)
# A two-sided 95% interval leaves 2.5% in each tail, so the multiplier is
# qnorm(.975); qnorm(.95) would give a 90% interval. Despite its name,
# standard_error holds the margin of error (half-width of the interval).
standard_error <- qnorm(.975) * sigma_indvar2 / sqrt(count_indvar2)
print(paste0("confidence interval 95% range: ",mu_indvar2-standard_error," to ",mu_indvar2+standard_error))
## NOTE: with the corrected multiplier the interval is roughly 462.01 to 483.95; re-run to refresh the exact echoed value.
# Empirical 5th and 95th percentiles of the observed data, for comparison.
print(quantile(ind_var2,c(.05,.95)))
## 5% 95%
## 0.0 850.1
# Drop every column in which at least half of the values are missing, then
# summarise what is left.
na_share <- colSums(is.na(train_set)) / nrow(train_set)
keep_cols <- na_share < 0.5
train_set <- train_set[keep_cols]
summary(train_set)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street LotShape LandContour
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## Utilities LotConfig LandSlope Neighborhood
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Condition1 Condition2 BldgType HouseStyle
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## OverallQual OverallCond YearBuilt YearRemodAdd
## Min. : 1.000 Min. :1.000 Min. :1872 Min. :1950
## 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954 1st Qu.:1967
## Median : 6.000 Median :5.000 Median :1973 Median :1994
## Mean : 6.099 Mean :5.575 Mean :1971 Mean :1985
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000 3rd Qu.:2004
## Max. :10.000 Max. :9.000 Max. :2010 Max. :2010
##
## RoofStyle RoofMatl Exterior1st Exterior2nd
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## MasVnrType MasVnrArea ExterQual ExterCond
## Length:1460 Min. : 0.0 Length:1460 Length:1460
## Class :character 1st Qu.: 0.0 Class :character Class :character
## Mode :character Median : 0.0 Mode :character Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## Foundation BsmtQual BsmtCond BsmtExposure
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## Length:1460 Min. : 0.0 Length:1460 Min. : 0.00
## Class :character 1st Qu.: 0.0 Class :character 1st Qu.: 0.00
## Mode :character Median : 383.5 Mode :character Median : 0.00
## Mean : 443.6 Mean : 46.55
## 3rd Qu.: 712.2 3rd Qu.: 0.00
## Max. :5644.0 Max. :1474.00
##
## BsmtUnfSF TotalBsmtSF Heating HeatingQC
## Min. : 0.0 Min. : 0.0 Length:1460 Length:1460
## 1st Qu.: 223.0 1st Qu.: 795.8 Class :character Class :character
## Median : 477.5 Median : 991.5 Mode :character Mode :character
## Mean : 567.2 Mean :1057.4
## 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :2336.0 Max. :6110.0
##
## CentralAir Electrical X1stFlrSF X2ndFlrSF
## Length:1460 Length:1460 Min. : 334 Min. : 0
## Class :character Class :character 1st Qu.: 882 1st Qu.: 0
## Mode :character Mode :character Median :1087 Median : 0
## Mean :1163 Mean : 347
## 3rd Qu.:1391 3rd Qu.: 728
## Max. :4692 Max. :2065
##
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## Min. : 0.000 Min. : 334 Min. :0.0000 Min. :0.00000
## 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 0.000 Median :1464 Median :0.0000 Median :0.00000
## Mean : 5.845 Mean :1515 Mean :0.4253 Mean :0.05753
## 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :572.000 Max. :5642 Max. :3.0000 Max. :2.00000
##
## FullBath HalfBath BedroomAbvGr KitchenAbvGr
## Min. :0.000 Min. :0.0000 Min. :0.000 Min. :0.000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000
## Median :2.000 Median :0.0000 Median :3.000 Median :1.000
## Mean :1.565 Mean :0.3829 Mean :2.866 Mean :1.047
## 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000
## Max. :3.000 Max. :2.0000 Max. :8.000 Max. :3.000
##
## KitchenQual TotRmsAbvGrd Functional Fireplaces
## Length:1460 Min. : 2.000 Length:1460 Min. :0.000
## Class :character 1st Qu.: 5.000 Class :character 1st Qu.:0.000
## Mode :character Median : 6.000 Mode :character Median :1.000
## Mean : 6.518 Mean :0.613
## 3rd Qu.: 7.000 3rd Qu.:1.000
## Max. :14.000 Max. :3.000
##
## FireplaceQu GarageType GarageYrBlt GarageFinish
## Length:1460 Length:1460 Min. :1900 Length:1460
## Class :character Class :character 1st Qu.:1961 Class :character
## Mode :character Mode :character Median :1980 Mode :character
## Mean :1979
## 3rd Qu.:2002
## Max. :2010
## NA's :81
## GarageCars GarageArea GarageQual GarageCond
## Min. :0.000 Min. : 0.0 Length:1460 Length:1460
## 1st Qu.:1.000 1st Qu.: 334.5 Class :character Class :character
## Median :2.000 Median : 480.0 Mode :character Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
## Length:1460 Min. : 0.00 Min. : 0.00 Min. : 0.00
## Class :character 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Median : 0.00 Median : 25.00 Median : 0.00
## Mean : 94.24 Mean : 46.66 Mean : 21.95
## 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00
## Max. :857.00 Max. :547.00 Max. :552.00
##
## X3SsnPorch ScreenPorch PoolArea MiscVal
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 0.00 Median : 0.00 Median : 0.000 Median : 0.00
## Mean : 3.41 Mean : 15.06 Mean : 2.759 Mean : 43.49
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00
## Max. :508.00 Max. :480.00 Max. :738.000 Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
# Build the modelling frame: drop character columns, then drop any row that
# still contains a missing value.
is_text_col <- vapply(train_set, is.character, logical(1))
clean_training_data <- train_set[, !is_text_col]
clean_training_data <- na.omit(clean_training_data)
## additional cleaning
# Columns removed by hand before fitting.
manual_drops <- c(
  "TotalBsmtSF", "GrLivArea", "OpenPorchSF", "YrSold",
  "HalfBath", "EnclosedPorch", "BsmtHalfBath"
)
for (drop_col in manual_drops) {
  clean_training_data[[drop_col]] <- NULL
}

# Linear regression of SalePrice on every remaining column.
multi_reg <- lm(SalePrice ~ ., data = clean_training_data)
summary(multi_reg)
##
## Call:
## lm(formula = SalePrice ~ ., data = clean_training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -443741 -17119 -2425 15105 317646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.012e+05 1.628e+05 -4.922 9.91e-07 ***
## Id -1.243e+00 2.652e+00 -0.469 0.639359
## MSSubClass -1.993e+02 3.434e+01 -5.803 8.52e-09 ***
## LotFrontage -1.159e+02 6.098e+01 -1.900 0.057695 .
## LotArea 5.394e-01 1.568e-01 3.439 0.000605 ***
## OverallQual 1.871e+04 1.468e+03 12.738 < 2e-16 ***
## OverallCond 5.204e+03 1.343e+03 3.874 0.000113 ***
## YearBuilt 3.015e+02 8.023e+01 3.758 0.000180 ***
## YearRemodAdd 1.214e+02 8.599e+01 1.412 0.158268
## MasVnrArea 3.141e+01 6.979e+00 4.501 7.48e-06 ***
## BsmtFinSF1 1.761e+01 5.755e+00 3.059 0.002273 **
## BsmtFinSF2 8.551e+00 8.654e+00 0.988 0.323307
## BsmtUnfSF 4.882e+00 5.225e+00 0.934 0.350342
## X1stFlrSF 4.584e+01 7.337e+00 6.248 5.97e-10 ***
## X2ndFlrSF 4.556e+01 5.291e+00 8.610 < 2e-16 ***
## LowQualFinSF 3.331e+01 2.784e+01 1.197 0.231753
## BsmtFullBath 8.660e+03 3.044e+03 2.845 0.004527 **
## FullBath 5.714e+03 3.186e+03 1.794 0.073138 .
## BedroomAbvGr -1.006e+04 2.138e+03 -4.708 2.82e-06 ***
## KitchenAbvGr -2.212e+04 6.674e+03 -3.315 0.000947 ***
## TotRmsAbvGrd 5.400e+03 1.477e+03 3.655 0.000270 ***
## Fireplaces 4.353e+03 2.172e+03 2.004 0.045340 *
## GarageYrBlt -5.013e+01 9.069e+01 -0.553 0.580509
## GarageCars 1.692e+04 3.456e+03 4.895 1.13e-06 ***
## GarageArea 6.461e+00 1.202e+01 0.538 0.590949
## WoodDeckSF 2.158e+01 9.935e+00 2.172 0.030092 *
## X3SsnPorch 3.359e+01 3.735e+01 0.899 0.368712
## ScreenPorch 5.609e+01 2.008e+01 2.793 0.005315 **
## PoolArea -5.846e+01 2.962e+01 -1.973 0.048709 *
## MiscVal -3.679e+00 6.911e+00 -0.532 0.594639
## MoSold -2.064e+02 4.143e+02 -0.498 0.618437
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36730 on 1090 degrees of freedom
## Multiple R-squared: 0.8094, Adjusted R-squared: 0.8042
## F-statistic: 154.3 on 30 and 1090 DF, p-value: < 2.2e-16
# Residual diagnostics for multi_reg: residuals against fitted values, then a
# normal Q-Q plot of the residuals with its reference line.
model_fitted <- fitted(multi_reg)
model_resid <- resid(multi_reg)
# Axis labels are set explicitly so they read the same as the defaults that
# plot() derives from the original call expressions.
plot(model_fitted, model_resid,
  xlab = "fitted(multi_reg)", ylab = "resid(multi_reg)"
)
qqnorm(model_resid)
qqline(model_resid)
# Score the test set with multi_reg and keep the predictions next to the
# test data.
# NOTE(review): test_set is not created in the visible part of this script -
# confirm it is loaded before this point.
prediction_data <- test_set
prediction_data$SalePrice <- predict(multi_reg, newdata = test_set)

# Index plots of predicted prices and of the training prices.
plot(prediction_data$SalePrice)
plot(train_set$SalePrice)

# Two-column frame of test Id and predicted price; the column names are the
# ones data.frame() derives from the original unnamed arguments.
kaggle_df <- data.frame(
  test_set.Id = test_set$Id,
  prediction_data.SalePrice = prediction_data$SalePrice
)
# fill() replaces missing predictions using neighbouring rows (default
# direction) - presumably rows where predict() returned NA; verify.
kaggle_df <- fill(kaggle_df, prediction_data.SalePrice)
#write.csv(kaggle_df, file = "final_Model_Prediction_Project.csv", row.names = FALSE)