Variables

We are going to pick the total square feet of the basement to see how that relates to the home’s price.

library(MASS)
library(knitr)
library(dplyr)
library(ggplot2)
library(reshape2)
# Read the training and test files; the first row of each holds column names.
houses <- read.csv(
  file = "/Users/admin/Documents/Data 605/train.csv",
  header = TRUE
)
houses_test <- read.csv(
  file = "/Users/admin/Documents/Data 605/test.csv",
  header = TRUE
)

# Density of the chosen predictor, total basement square footage.
ggplot(data = houses, mapping = aes(x = TotalBsmtSF)) +
  geom_density(fill = "5")

# X is the predictor (basement area); Y is the response (sale price).
X <- houses[["TotalBsmtSF"]]
Y <- houses[["SalePrice"]]

# Scatter plot of Y against X with the least-squares line drawn on top.
plot(
  x = X,
  y = Y,
  col = "green",
  main = "Total SqFt of Basement and Sale Price",
  xlab = "Total SqFt of Basement",
  ylab = "Sale Price"
)
abline(reg = lm(Y ~ X), col = "yellow", lwd = 3)

# Five-number summary plus mean of the sale price.
summary(Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000

Probability

# First quartile (25th percentile) of total basement square footage
xQ1 <- quantile(X, probs = 0.25)
xQ1
##    25% 
## 795.75
# First quartile (25th percentile) of sale price
yQ1 <- quantile(Y, probs = 0.25)
yQ1
##    25% 
## 129975

a) P(X>x | Y>y)

# Sample size, and the number of houses priced above the first price quartile.
n <- nrow(houses)
nyQ1 <- sum(Y > yQ1, na.rm = TRUE)

# P(X > xQ1 | Y > yQ1): among houses above the price quartile, the share whose
# basement area is also above its own first quartile.
P_a <- sum(X > xQ1 & Y > yQ1, na.rm = TRUE) / nyQ1
P_a
## [1] 0.830137

b) P(X>x, Y>y)

# Joint probability P(X > xQ1, Y > yQ1), taken over all n houses.
P_b <- sum(X > xQ1 & Y > yQ1, na.rm = TRUE) / n
P_b
## [1] 0.6226027

c) P(X<x | Y>y)

# P(X < xQ1 | Y > yQ1): among houses above the price quartile, the share whose
# basement area is below its first quartile.
P_c <- sum(X < xQ1 & Y > yQ1, na.rm = TRUE) / nyQ1
P_c
## [1] 0.169863

Total

# Joint relative frequencies of the four quartile cells.
# Rows of the final table split on X (basement area), columns on Y (price).
c1 <- sum(X <= xQ1 & Y <= yQ1, na.rm = TRUE) / n
c2 <- sum(X <= xQ1 & Y > yQ1, na.rm = TRUE) / n
c4 <- sum(X > xQ1 & Y <= yQ1, na.rm = TRUE) / n
c5 <- sum(X > xQ1 & Y > yQ1, na.rm = TRUE) / n

# Row totals (c3, c6), column totals (c7, c8) and the grand total (c9).
c3 <- c1 + c2
c6 <- c4 + c5
c7 <- c1 + c4
c8 <- c2 + c5
c9 <- c3 + c6

# Assemble the 3x3 table row by row, rounded to three decimals.
quartile_labels <- c("<=1st quartile", ">1st quartile", "Total")
total <- matrix(
  round(c(c1, c2, c3, c4, c5, c6, c7, c8, c9), 3),
  nrow = 3,
  ncol = 3,
  byrow = TRUE
)
dimnames(total) <- list(quartile_labels, quartile_labels)

print(as.table(total))
##                <=1st quartile >1st quartile Total
## <=1st quartile          0.123         0.127 0.250
## >1st quartile           0.127         0.623 0.750
## Total                   0.250         0.750 1.000

Check for Independence

# Independence check. With A = {X > xQ1} and B = {Y > yQ1}, the two events are
# independent only if P(A and B) = P(A) * P(B), equivalently P(A | B) = P(A).
# The marginals come from the table totals: c6 = P(A) and c8 = P(B); the joint
# probability P(A and B) is the cell c5. The product must use the marginals,
# not two joint cells.
print(paste("P(A)*P(B)=", round(c6 * c8, 4)))
## [1] "P(A)*P(B)= 0.5625"
print(paste("P(A and B)=", round(c5, 4)))
## [1] "P(A and B)= 0.6226"
print(paste("P(A)=", round(c6, 3)))
## [1] "P(A)= 0.75"
print(paste("P(A|B)=", round(c5 / c8, 3)))
## [1] "P(A|B)= 0.83"
print("P(A and B) != P(A)*P(B), so X and Y are not independent")
## [1] "P(A and B) != P(A)*P(B), so X and Y are not independent"

Chi-square

# Test independence on the 2x2 table of counts split at the first quartiles.
# Cross-tabulating the raw values of X and Y instead yields a huge, sparse
# table (hundreds of thousands of cells for 1460 houses) whose expected counts
# are almost all far below 1, so the chi-squared approximation is not valid
# there.
chi <- table(X > xQ1, Y > yQ1, dnn = c("X > xQ1", "Y > yQ1"))
chisq.test(chi)
# NOTE(review): re-run this chunk to refresh the printed test result. From the
# cell proportions in the table above, the statistic should be roughly 148 on
# 1 degree of freedom (p-value < 2.2e-16) -- confirm.

The p-value is below 2.2e-16, so we reject the null hypothesis that the variables are independent. There is a dependence between total basement square feet and sale price.

Descriptive and Inferential Stats

# Density of total basement square footage.
ggplot(data = houses, mapping = aes(x = TotalBsmtSF)) +
  geom_density(fill = "5")

# Sale price against basement area, with the least-squares line overlaid.
plot(
  x = X,
  y = Y,
  col = "green",
  main = "Total SqFt of Basement and Sale Price",
  xlab = "Total SqFt of Basement",
  ylab = "Sale Price"
)
abline(reg = lm(Y ~ X), col = "yellow", lwd = 3)

Confidence Interval

# Pearson correlation between X and Y: tests H0 "true correlation is 0" and
# reports a 99 percent confidence interval for the correlation.
cor.test(x = X, y = Y, conf.level = 0.99)
## 
##  Pearson's product-moment correlation
## 
## data:  X and Y
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
##  0.5697562 0.6539251
## sample estimates:
##       cor 
## 0.6135806
# Welch two-sample t-test: the interval below is for the difference
# mean(X) - mean(Y), not for either mean on its own.
t.test(x = X, y = Y, conf.level = 0.99)
## 
##  Welch Two Sample t-test
## 
## data:  X and Y
## t = -86.509, df = 1459.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
##  -185226.3 -174501.2
## sample estimates:
##  mean of x  mean of y 
##   1057.429 180921.196

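Looking at the results, the 99% confidence interval for the correlation is roughly 0.57 to 0.65 and does not contain 0, and the p-value is small, so we can conclude that there is a relationship between the total basement sq ft and the sale price. Note that the t-test interval (about -$185K to -$175K) is for the difference between the mean basement area and the mean sale price, not for the mean sale price itself; the sample mean sale price is about $181K.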

Linear Algebra and Correlation

# Correlation matrix of the two variables of interest.
cor_matrix <- cor(houses[, c("TotalBsmtSF", "SalePrice")])
cor_matrix
##             TotalBsmtSF SalePrice
## TotalBsmtSF   1.0000000 0.6135806
## SalePrice     0.6135806 1.0000000
# Generalized inverse of the correlation matrix (the precision matrix).
inv_cor_matrix <- MASS::ginv(cor_matrix)
inv_cor_matrix
##            [,1]       [,2]
## [1,]  1.6038006 -0.9840609
## [2,] -0.9840609  1.6038006
# Multiplying in either order should give the identity, up to rounding error.
cor_matrix %*% inv_cor_matrix
##                      [,1]         [,2]
## TotalBsmtSF  1.000000e+00 5.551115e-16
## SalePrice   -2.220446e-16 1.000000e+00
inv_cor_matrix %*% cor_matrix
##       TotalBsmtSF SalePrice
## [1,] 1.000000e+00         0
## [2,] 3.330669e-16         1

The identity matrix is returned for both products, up to floating-point rounding error.

Calculus Based Probability

# Minimum basement area plus a tiny offset.
# NOTE(review): min_sqft is not used by the fit below, which runs on the
# unshifted X -- confirm whether a shift was intended.
min_sqft <- min(houses$TotalBsmtSF) + .00001

# Fit an exponential distribution to X; the estimate is the rate parameter.
fit <- fitdistr(X, "exponential")
lambda <- fit$estimate

# Fix the seed so the simulated sample, and therefore the histogram, is the
# same on every run.
set.seed(605)
sampledf <- data.frame(sample = rexp(1000, lambda))

hist(
  sampledf$sample,
  main = "Histogram of Exponential Distribution",
  xlab = "Basement SqFt",
  breaks = 30
)

# 5th and 95th percentiles implied by the fitted exponential distribution ...
print(paste("5th percentile=", qexp(.05, rate = lambda)))
## [1] "5th percentile= 54.2390401783128"
print(paste("95th percentile=", qexp(.95, rate = lambda)))
## [1] "95th percentile= 3167.77553652706"
# ... compared with the empirical percentiles of the observed data.
quantile(X, .05)
##    5% 
## 519.3
quantile(X, .95)
##  95% 
## 1753

Modeling

# Keep only the numeric columns of the training data.
houses_num <- dplyr::select_if(houses, is.numeric)

# Pairwise correlations, reshaped to long form (Var1, Var2, value); pairs
# whose correlation is NA are dropped.
houeses_cor <- cor(houses_num)
houses_correlation <- na.omit(reshape2::melt(houeses_cor))

# Rows pairing SalePrice with each variable, strongest absolute correlation
# first. The top six include SalePrice with itself, leaving five predictors.
sale_cor <- houses_correlation[houses_correlation$Var1 == "SalePrice", ]
head(sale_cor[order(-abs(sale_cor$value)), ], n = 6)
##           Var1        Var2     value
## 1444 SalePrice   SalePrice 1.0000000
## 190  SalePrice OverallQual 0.7909816
## 646  SalePrice   GrLivArea 0.7086245
## 1026 SalePrice  GarageCars 0.6404092
## 1064 SalePrice  GarageArea 0.6234314
## 494  SalePrice TotalBsmtSF 0.6135806
# Fit a linear model of sale price on the five most correlated predictors.
# The formula uses bare column names resolved through `data = houses`; writing
# `houses$col` inside the formula would tie every term to the training vectors
# and make predict(fit, newdata = ...) ignore the new data.
fit <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea + TotalBsmtSF,
  data = houses
)
summary(fit)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + 
##     GarageArea + TotalBsmtSF, data = houses)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -478977  -19915   -1503   16701  287132 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -99072.050   4638.450 -21.359  < 2e-16 ***
## OverallQual  23635.007   1072.532  22.037  < 2e-16 ***
## GrLivArea       45.346      2.489  18.218  < 2e-16 ***
## GarageCars   14544.315   3022.681   4.812 1.65e-06 ***
## GarageArea      17.133     10.468   1.637    0.102    
## TotalBsmtSF     31.501      2.904  10.848  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38900 on 1454 degrees of freedom
## Multiple R-squared:  0.7611, Adjusted R-squared:  0.7603 
## F-statistic: 926.5 on 5 and 1454 DF,  p-value: < 2.2e-16

Sale Price = -99072.050 + 23635.007(OverallQual) + 45.346(GrLivArea) + 14544.315(GarageCars) + 17.133(GarageArea) + 31.501(TotalBsmtSF)

# Score the TEST set with the fitted model and write the submission file.
# Predictors are listed in the same order as the model terms, so the
# coefficients can be applied by position (intercept first).
predictors <- c(
  "OverallQual", "GrLivArea", "GarageCars", "GarageArea", "TotalBsmtSF"
)
model_coefs <- unname(coef(fit))
stopifnot(length(model_coefs) == length(predictors) + 1L)

# Fill any missing predictor value in the test set with the training-set
# median, so that every test Id receives a numeric prediction.
train_medians <- vapply(
  houses[predictors], median, numeric(1), na.rm = TRUE
)
test_x <- houses_test[predictors]
test_x[] <- Map(
  function(column, fill) replace(column, is.na(column), fill),
  test_x,
  train_medians
)

# Intercept plus the linear combination of the test-set predictors.
SalePrice <- as.vector(cbind(1, as.matrix(test_x)) %*% model_coefs)

# One row per test Id, so the two columns always have matching lengths.
kaggle_submission <- data.frame(Id = houses_test$Id, SalePrice = SalePrice)

write.csv(
  kaggle_submission,
  file = "dqkaggle.csv",
  quote = FALSE,
  row.names = FALSE
)

Kaggle Score

Kaggle score

Kaggle score