We are going to pick the total square feet of the basement to see how that relates to the home’s price.
library(MASS)
library(knitr)
library(dplyr)
library(ggplot2)
library(reshape2)

# Read the training and test CSV files; the first row holds the column names.
houses <- read.csv("/Users/admin/Documents/Data 605/train.csv", header = TRUE)
houses_test <- read.csv("/Users/admin/Documents/Data 605/test.csv", header = TRUE)

# Density of the total basement square footage in the training set.
ggplot(houses, aes(x = TotalBsmtSF)) +
  geom_density(fill = "5")

# X is the explanatory variable (basement area), Y the response (sale price).
X <- houses$TotalBsmtSF
Y <- houses$SalePrice

# Scatter plot of the two variables with the least-squares line overlaid.
plot(X, Y, col = "green", main = "Total SqFt of Basement and Sale Price",
     xlab = "Total SqFt of Basement", ylab = "Sale Price")
abline(lm(Y ~ X), col = "yellow", lwd = 3)

# Five-number summary (plus mean) of the sale price.
summary(Y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
# First quartile of total basement square footage
xQ1 <- quantile(X, probs = 0.25)
xQ1
## 25%
## 795.75

# First quartile of sale price
yQ1 <- quantile(Y, probs = 0.25)
yQ1
## 25%
## 129975

# Row masks relative to the two quartile thresholds. Counting with
# na.rm = TRUE mirrors subset(), which drops rows whose condition is NA.
x_above_q1 <- as.numeric(X) > xQ1
x_below_q1 <- as.numeric(X) < xQ1
y_above_q1 <- as.numeric(Y) > yQ1

n <- nrow(houses)
nyQ1 <- sum(y_above_q1, na.rm = TRUE)

# P(X > xQ1 | Y > yQ1)
P_a <- sum(x_above_q1 & y_above_q1, na.rm = TRUE) / nyQ1
P_a
## [1] 0.830137

# P(X > xQ1, Y > yQ1)
P_b <- sum(x_above_q1 & y_above_q1, na.rm = TRUE) / n
P_b
## [1] 0.6226027

# P(X < xQ1 | Y > yQ1)
P_c <- sum(x_below_q1 & y_above_q1, na.rm = TRUE) / nyQ1
P_c
## [1] 0.169863
# Joint and marginal proportions of houses split at the first quartile of
# basement area (rows) and of sale price (columns).
x_low <- as.numeric(X) <= xQ1
x_high <- as.numeric(X) > xQ1
y_low <- as.numeric(Y) <= yQ1
y_high <- as.numeric(Y) > yQ1

# Row 1: X at or below its first quartile
c1 <- sum(x_low & y_low, na.rm = TRUE) / n
c2 <- sum(x_low & y_high, na.rm = TRUE) / n
c3 <- c1 + c2

# Row 2: X above its first quartile
c4 <- sum(x_high & y_low, na.rm = TRUE) / n
c5 <- sum(x_high & y_high, na.rm = TRUE) / n
c6 <- c4 + c5

# Row 3: column totals
c7 <- c1 + c4
c8 <- c2 + c5
c9 <- c3 + c6

quartile_labels <- c("<=1st quartile", ">1st quartile", "Total")
total <- round(rbind(c(c1, c2, c3), c(c4, c5, c6), c(c7, c8, c9)), 3)
dimnames(total) <- list(quartile_labels, quartile_labels)
print(as.table(total))
## <=1st quartile >1st quartile Total
## <=1st quartile 0.123 0.127 0.250
## >1st quartile 0.127 0.623 0.750
## Total 0.250 0.750 1.000
# Independence check for the events A = {X > xQ1} and B = {Y > yQ1}.
# The marginals come from the table margins: P(A) is the ">1st quartile" row
# total (c6) and P(B) the ">1st quartile" column total (c8). The earlier
# version multiplied two joint cells (c4 * c5), which is not P(A) * P(B).
p_A <- c6
p_B <- c8
p_A_and_B <- c5
p_A_given_B <- p_A_and_B / p_B

# With both marginals at 0.750 in the table, the product is about 0.5625.
print(paste("P(A)*p(B)=", round(p_A * p_B, 3)))
print(paste("P(A and B)=", round(p_A_and_B, 3)))
print(paste("P(A)=", round(p_A, 3)))
print(paste("P(A|B)=", round(p_A_given_B, 3)))
## [1] "P(A|B)= 0.83"

# A and B are independent only if P(A|B) == P(A), or equivalently
# P(A and B) == P(A) * P(B).
if (isTRUE(all.equal(p_A_given_B, p_A))) {
  print("P(A|B) == P(A): A and B are independent")
} else {
  print("P(A|B) != P(A): A and B are not independent")
}
# Chi-squared test of independence on the 2x2 quartile split.
# Cross-tabulating the raw continuous values (table(X, Y)) gives a huge, almost
# empty table: the original run reported X-squared = 509710 on df = 476640
# together with "Chi-squared approximation may be incorrect". Testing the
# same above/below-first-quartile events used for the probability table keeps
# the expected cell counts large enough for the approximation to hold.
chi <- table(X > xQ1, Y > yQ1, dnn = c("X > xQ1", "Y > yQ1"))
chi
chisq.test(chi)
# NOTE(review): not re-run here; given the cell proportions in the table above
# the p-value is expected to remain far below 0.01 -- confirm on re-knit.
The p-value is < 2.2e-16, so we can reject the null hypothesis that the variables are independent. There is a dependence between total basement square feet and sale price.
# Density of the total basement square footage.
ggplot(houses, aes(x = TotalBsmtSF)) +
  geom_density(fill = "5")

# Scatter plot of basement area against sale price with the fitted line.
plot(X, Y, col = "green", main = "Total SqFt of Basement and Sale Price",
     xlab = "Total SqFt of Basement", ylab = "Sale Price")
abline(lm(Y ~ X), col = "yellow", lwd = 3)

# Test H0: correlation = 0 and report a 99% confidence interval.
cor.test(X, Y, conf.level = 0.99)
##
## Pearson's product-moment correlation
##
## data: X and Y
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.5697562 0.6539251
## sample estimates:
## cor
## 0.6135806
# Welch two-sample t-test on the means of X and Y at the 99% level.
# NOTE(review): X is in square feet and Y in dollars, so the reported interval
# is for the difference of the two means, not for the mean sale price --
# confirm this is the comparison that was intended.
t.test(x = X, y = Y, conf.level = 0.99)
##
## Welch Two Sample t-test
##
## data: X and Y
## t = -86.509, df = 1459.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
## -185226.3 -174501.2
## sample estimates:
## mean of x mean of y
## 1057.429 180921.196
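Looking at the results, the 99% confidence interval from the t-test is for the difference between the mean basement square footage and the mean sale price (about -185K to -175K); the mean sale price itself is about $181K. The p-value of the correlation test is small, so we can conclude that there is a relationship between the total basement sq ft and the sale price.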
# Correlation matrix of the two variables under study.
pair_cols <- c("TotalBsmtSF", "SalePrice")
cor_matrix <- cor(houses[, pair_cols])
cor_matrix
## TotalBsmtSF SalePrice
## TotalBsmtSF 1.0000000 0.6135806
## SalePrice 0.6135806 1.0000000

# Generalized inverse (MASS::ginv) of the correlation matrix.
inv_cor_matrix <- ginv(cor_matrix)
inv_cor_matrix
## [,1] [,2]
## [1,] 1.6038006 -0.9840609
## [2,] -0.9840609 1.6038006

# Multiply the matrix by its inverse in both orders.
cor_matrix %*% inv_cor_matrix
## [,1] [,2]
## TotalBsmtSF 1.000000e+00 5.551115e-16
## SalePrice -2.220446e-16 1.000000e+00

inv_cor_matrix %*% cor_matrix
## TotalBsmtSF SalePrice
## [1,] 1.000000e+00 0
## [2,] 3.330669e-16 1
The identity matrix is returned for both products (up to floating-point rounding error in the off-diagonal entries).
# Smallest basement area nudged just above its value; not used further in
# this section. NOTE(review): presumably intended as a shift so that X is
# strictly positive -- confirm whether it is still needed.
min_sqft <- min(houses$TotalBsmtSF) + .00001

# Fit an exponential distribution to X by maximum likelihood (MASS::fitdistr)
# and keep the estimated rate.
fit <- fitdistr(X, "exponential")
lambda <- fit$estimate

# Draw 1000 values from the fitted distribution. No seed is set, so the
# histogram changes from run to run.
sampledf <- data.frame(sample = rexp(1000, lambda))
hist(sampledf$sample, main = "Histogram of Exponential Distribution",
     xlab = "Basement SqFt", breaks = 30)

# 5th and 95th percentiles implied by the fitted exponential distribution.
print(paste("5th percentile=", qexp(.05, rate = lambda)))
## [1] "5th percentile= 54.2390401783128"
print(paste("95th percentile=", qexp(.95, rate = lambda)))
## [1] "95th percentile= 3167.77553652706"

# Empirical 5th and 95th percentiles of the observed data, for comparison.
quantile(X, .05)
## 5%
## 519.3
quantile(X, .95)
## 95%
## 1753
# Keep only the numeric columns of the houses data
numeric_cols <- vapply(houses, is.numeric, logical(1))
houses_num <- houses[numeric_cols]

# Pairwise correlations, reshaped to long form (Var1, Var2, value); pairs
# whose correlation is NA are dropped
houeses_cor <- cor(houses_num)
houses_correlation <- na.omit(melt(houeses_cor))

# Six strongest correlations with SalePrice by absolute value (the first row
# is SalePrice with itself and can be ignored)
sale_cor <- houses_correlation[houses_correlation$Var1 == "SalePrice", ]
ranked_by_strength <- order(-abs(sale_cor$value))
head(sale_cor[ranked_by_strength, ], 6)
## Var1 Var2 value
## 1444 SalePrice SalePrice 1.0000000
## 190 SalePrice OverallQual 0.7909816
## 646 SalePrice GrLivArea 0.7086245
## 1026 SalePrice GarageCars 0.6404092
## 1064 SalePrice GarageArea 0.6234314
## 494 SalePrice TotalBsmtSF 0.6135806
# Fit a linear model of SalePrice on its five most correlated predictors.
# The terms are bare column names resolved through `data = houses`. Writing
# them as `houses$...` ties the model to the training vectors, so the `data`
# argument is ignored and predict(fit, newdata = ...) cannot score other data.
fit <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea + TotalBsmtSF,
  data = houses
)
summary(fit)
# Recorded output of the original run. The term labels carried a `houses$`
# prefix at the time; the estimates are the same model and are unaffected by
# the formula rewrite.
##
## Call:
## lm(formula = houses$SalePrice ~ houses$OverallQual + houses$GrLivArea +
## houses$GarageCars + houses$GarageArea + houses$TotalBsmtSF,
## data = houses)
##
## Residuals:
## Min 1Q Median 3Q Max
## -478977 -19915 -1503 16701 287132
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -99072.050 4638.450 -21.359 < 2e-16 ***
## houses$OverallQual 23635.007 1072.532 22.037 < 2e-16 ***
## houses$GrLivArea 45.346 2.489 18.218 < 2e-16 ***
## houses$GarageCars 14544.315 3022.681 4.812 1.65e-06 ***
## houses$GarageArea 17.133 10.468 1.637 0.102
## houses$TotalBsmtSF 31.501 2.904 10.848 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38900 on 1454 degrees of freedom
## Multiple R-squared: 0.7611, Adjusted R-squared: 0.7603
## F-statistic: 926.5 on 5 and 1454 DF, p-value: < 2.2e-16
Sale Price = -99072.050 + 23635.007(OverallQual) + 45.346(GrLivArea) + 14544.315(GarageCars) + 17.133(GarageArea) + 31.501(TotalBsmtSF)
# Score the TEST set with the fitted model and write the Kaggle submission.
# The earlier version evaluated the equation on the training rows, which is
# why cbind() with the test Ids warned about mismatched lengths. It also left
# out the intercept and hard-coded rounded coefficients.
beta <- coef(fit)
# Accept coefficient labels with or without a `houses$` prefix.
names(beta) <- sub("houses$", "", names(beta), fixed = TRUE)
predictors <- setdiff(names(beta), "(Intercept)")

test_x <- houses_test[predictors]
# A missing predictor would give an NA prediction; fill any gaps with the
# training-set median of that column.
# NOTE(review): assumes median imputation is acceptable -- confirm.
for (p in predictors) {
  gaps <- is.na(test_x[[p]])
  test_x[[p]][gaps] <- median(houses[[p]], na.rm = TRUE)
}

SalePrice <- as.vector(
  beta[["(Intercept)"]] + as.matrix(test_x) %*% beta[predictors]
)

# One row per test house: its Id and the predicted sale price.
kaggle_submission <- data.frame(Id = houses_test$Id, SalePrice = SalePrice)
write.csv(kaggle_submission, file = "dqkaggle.csv", quote = FALSE,
          row.names = FALSE)

# Kaggle score