Your final is due by the end of day on 5/20/2018. You should post your solutions to your GitHub account or RPubs. You are also expected to make a short presentation via YouTube and post that recording to the board. This project will show off your ability to understand the elements of the class.

You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition.

https://www.kaggle.com/c/house-prices-advanced-regression-techniques. I want you to do the following.

Pick one of the quantitative independent variables from the training data set (train.csv), and define that variable as X. Make sure this variable is skewed to the right!

We pick LotArea as the quantitative independent variable X and SalePrice as the dependent variable Y. Much of the decision to purchase a property seems to be driven by the size of its lot. First, let's load the data.

library(MASS)
library(knitr)
library(ggplot2)
library(magrittr)
lot_price<-read.csv("train.csv")
head(lot_price)

Let’s create a subset of the train dataset.

lotprice_sub<-lot_price[,c("SalePrice", "LotArea")]
head(lotprice_sub)
X<-lotprice_sub$LotArea
Y<-lotprice_sub$SalePrice

plot(X,Y, main="Lot Area and Sale Price", xlab = "Lot Area", ylab = "Sale Price")
abline(lm(Y ~ X), col="red", lwd = 3)

hist(X, col="blue", main = "Histogram of Lot Area", xlab = "Lot Area")

hist(Y, col="green", main = "Histogram of Sale Price", xlab="Sale Price")

#Summary of Lot Area
summary(X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1300    7554    9478   10517   11602  215245
#Summary of Sale Price
summary(Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
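The summaries already suggest right skew for Lot Area (the mean, 10,517, sits well above the median, 9,478, with a long right tail up to 215,245). As an additional check, a simple sample skewness can be computed from the X defined above (a positive value indicates right skew; output not shown):

# Moment-based sample skewness of Lot Area; positive => right-skewed
mean((X - mean(X))^3) / sd(X)^3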

Probability.

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.
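Here x = 7554 (the 1st quartile of LotArea) and y = 129975 (the 1st quartile of SalePrice), as given by the summaries above, and each conditional probability is computed from its definition: \[ P(X>x \mid Y>y) = \frac{P(X>x,\ Y>y)}{P(Y>y)}. \]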

X<-lotprice_sub$LotArea
Y<-lotprice_sub$SalePrice

XQ1<- quantile(X, 0.25)
YQ1<-quantile(Y, 0.25)

n<-(nrow(lotprice_sub))
a. P(X>x | Y>y)
#P(A|B) Probability of A given that B has occurred is equal to P(A,B)/P(B)

lotarea<-as.numeric(lotprice_sub$LotArea)
saleprice<-as.numeric(lotprice_sub$SalePrice)

nYQ1<-nrow(subset(lotprice_sub, saleprice>YQ1))

p1<-nrow(subset(lotprice_sub, lotarea>XQ1 & saleprice>YQ1))/nYQ1
p1
## [1] 0.8200913
Given that a house's sale price is above its first quartile, there is about an 82% chance that its lot area is also above its first quartile.

b. P(X>x, Y>y)
p2<-nrow(subset(lotprice_sub, lotarea>XQ1 & saleprice>YQ1))/n
p2
## [1] 0.6150685
About 62% of all houses in the training set have both lot area and sale price above their respective first quartiles.

c. P(X<x | Y>y)
p3<-nrow(subset(lotprice_sub, lotarea<XQ1 & saleprice>YQ1))/nYQ1
p3
## [1] 0.1799087
Given that the sale price is above its first quartile, there is only about an 18% chance that the lot area is below its first quartile. Finally, the table of joint proportions:

c1<-nrow(subset(lotprice_sub, lotarea <=XQ1 & saleprice<=YQ1))/n
c2<-nrow(subset(lotprice_sub, lotarea <=XQ1 & saleprice>YQ1))/n
c3<-c1+c2
c4<-nrow(subset(lotprice_sub, lotarea > XQ1 & saleprice<=YQ1))/n
c5<-nrow(subset(lotprice_sub, lotarea > XQ1 & saleprice > YQ1))/n
c6<-c4+c5
c7<-c1+c4
c8<-c2+c5
c9<-c3+c6
# Rows: Lot Area (X); columns: Sale Price (Y)
m_lotprice<-matrix(round(c(c1,c2,c3,c4,c5,c6,c7,c8,c9), 3), ncol=3, nrow=3, byrow=TRUE)
colnames(m_lotprice)<-c("<=1st quartile",
                        ">1st quartile",
                        "Total")
rownames(m_lotprice)<-c("<=1st quartile", ">1st quartile", "Total")
m_lotprice<-as.table(m_lotprice)
m_lotprice
##                <=1st quartile >1st quartile Total
## <=1st quartile          0.115         0.135 0.250
## >1st quartile           0.135         0.615 0.750
## Total                   0.250         0.750 1.000

Independence

Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 1st quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does P(AB)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.

Splitting the data in this way does not by itself make the variables independent; it simply gives us a way to test for independence, here with a chi-squared test.

papb<-c6*c8  # P(A)*P(B) = P(X > x) * P(Y > y)
print(paste("P(A)*P(B) =", round(papb, 4)))
## [1] "P(A)*P(B) = 0.5625"

\[ P(A \cap B) = P(X>x,\ Y>y) = 0.615 \] \[ P(A)\,P(B) = 0.75 \times 0.75 = 0.5625 \] \[ P(A \cap B) \neq P(A)\,P(B) \]

Since P(AB) = 0.615 does not equal P(A)P(B) = 0.5625, splitting at the first quartiles does not yield independent events; the chi-squared test below confirms the association.

m_vals<-round(m_lotprice*1460,0)
m_vals
##                <=1st quartile >1st quartile Total
## <=1st quartile            168           197   365
## >1st quartile             197           898  1095
## Total                     365          1095  1460
m_chitst<-matrix(c(168,197,197,898), 2,2, byrow=TRUE)
chisq.test(m_chitst, correct=TRUE)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  m_chitst
## X-squared = 113.27, df = 1, p-value < 2.2e-16

The p-value is far smaller than 0.05, so we reject the null hypothesis of independence and conclude that lot area and sale price appear to be associated (dependent).
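As an aside, the same 2 x 2 table of counts can be built directly from the data with table() rather than hardcoding the cell counts; a short sketch using the objects defined above (output omitted):

A <- lotprice_sub$LotArea > XQ1     # TRUE when lot area is above its 1st quartile
B <- lotprice_sub$SalePrice > YQ1   # TRUE when sale price is above its 1st quartile
chisq.test(table(A, B), correct = TRUE)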

Descriptive and Inferential Statistics.

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y.

lotprice_sub_df<-data.frame(X,Y)
summary(lotprice_sub_df)
##        X                Y         
##  Min.   :  1300   Min.   : 34900  
##  1st Qu.:  7554   1st Qu.:129975  
##  Median :  9478   Median :163000  
##  Mean   : 10517   Mean   :180921  
##  3rd Qu.: 11602   3rd Qu.:214000  
##  Max.   :215245   Max.   :755000
library(ggplot2)
ggplot(lotprice_sub_df, aes(x=X)) + 
  geom_histogram(aes(y=..density..), colour="black", fill="#56B4E9")+
  geom_density(alpha=0.5) +
  xlab("Lot Area") +
  geom_vline(aes(xintercept=median(lotprice_sub_df$X)),
             color="blue", linetype="dashed", size=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(lotprice_sub_df, aes(x=Y)) + 
  geom_histogram(aes(y=..density..), colour="black", fill="#56B4E9", binwidth = 20000)+
  geom_density(alpha=0.5) +
  xlab("Sales Price") +
  geom_vline(aes(xintercept=median(lotprice_sub_df$Y)),
             color="blue", linetype="dashed", size=1)

ggplot(lotprice_sub_df, aes(x=X, y=Y)) +
  geom_point(shape=1) +    
  geom_smooth(method=lm,   
              se=TRUE)

Correlation

Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

var_lotprice<-data.frame(lot_price$LotArea, lot_price$YearBuilt,lot_price$SalePrice)
cor(var_lotprice)
##                     lot_price.LotArea lot_price.YearBuilt
## lot_price.LotArea          1.00000000          0.01422765
## lot_price.YearBuilt        0.01422765          1.00000000
## lot_price.SalePrice        0.26384335          0.52289733
##                     lot_price.SalePrice
## lot_price.LotArea             0.2638434
## lot_price.YearBuilt           0.5228973
## lot_price.SalePrice           1.0000000
cor.test(lotprice_sub$LotArea, lotprice_sub$SalePrice, conf.level = 0.92)
## 
##  Pearson's product-moment correlation
## 
## data:  lotprice_sub$LotArea and lotprice_sub$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.2206794 0.3059759
## sample estimates:
##       cor 
## 0.2638434
cor.test(lot_price$YearBuilt, lot_price$SalePrice, conf.level = 0.92)
## 
##  Pearson's product-moment correlation
## 
## data:  lot_price$YearBuilt and lot_price$SalePrice
## t = 23.424, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.4887787 0.5554189
## sample estimates:
##       cor 
## 0.5228973
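Only two of the three pairwise tests are shown above; the remaining pair (LotArea vs. YearBuilt) can be tested the same way (output not shown):

cor.test(lot_price$LotArea, lot_price$YearBuilt, conf.level = 0.92)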

With 92% confidence, the correlation between lot area and sale price is between 0.22 and 0.31, and the correlation between year built and sale price is between 0.49 and 0.56. Both p-values are essentially zero, so we reject the hypothesis of zero correlation for each pair: there is a modest linear relationship between lot area and sale price and a stronger one between year built and sale price. Regarding familywise error: with three pairwise tests each run at the 8% level, the chance of at least one false positive would be roughly 1 - 0.92^3, or about 22%, if all the null hypotheses were true, so some concern is warranted and a Bonferroni-type correction would be a simple safeguard. Here, however, the p-values are so small that the conclusions would not change.

Linear Algebra and Correlation.

Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

cor_mat<-cor(var_lotprice)
cor_mat
##                     lot_price.LotArea lot_price.YearBuilt
## lot_price.LotArea          1.00000000          0.01422765
## lot_price.YearBuilt        0.01422765          1.00000000
## lot_price.SalePrice        0.26384335          0.52289733
##                     lot_price.SalePrice
## lot_price.LotArea             0.2638434
## lot_price.YearBuilt           0.5228973
## lot_price.SalePrice           1.0000000
prec_mat<-solve(cor_mat)
prec_mat
##                     lot_price.LotArea lot_price.YearBuilt
## lot_price.LotArea           1.0997293           0.1872824
## lot_price.YearBuilt         0.1872824           1.4082080
## lot_price.SalePrice        -0.3880857          -0.7857614
##                     lot_price.SalePrice
## lot_price.LotArea            -0.3880857
## lot_price.YearBuilt          -0.7857614
## lot_price.SalePrice           1.5132664
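As the prompt notes, the diagonal of the precision matrix contains the variance inflation factors of the three variables; they can be read off directly (output not shown):

diag(prec_mat)   # variance inflation factors: 1 / (1 - R_i^2) for each variable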
#correlation matrix multiplied by the precision matrix
round(cor_mat %*% prec_mat, digits = 0 )
##                     lot_price.LotArea lot_price.YearBuilt
## lot_price.LotArea                   1                   0
## lot_price.YearBuilt                 0                   1
## lot_price.SalePrice                 0                   0
##                     lot_price.SalePrice
## lot_price.LotArea                     0
## lot_price.YearBuilt                   0
## lot_price.SalePrice                   1
#precision matrix multiplied by the correlation matrix
round(prec_mat %*% cor_mat, digits = 0 )
##                     lot_price.LotArea lot_price.YearBuilt
## lot_price.LotArea                   1                   0
## lot_price.YearBuilt                 0                   1
## lot_price.SalePrice                 0                   0
##                     lot_price.SalePrice
## lot_price.LotArea                     0
## lot_price.YearBuilt                   0
## lot_price.SalePrice                   1

Multiplying the correlation matrix by the precision matrix (in either order) results in the identity matrix.
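The prompt also asks for an LU decomposition of the correlation matrix; a sketch using the Matrix package (assumed available, as it ships with R; output not shown):

library(Matrix)
lu_fac <- lu(cor_mat)   # LU factorization with partial pivoting
expand(lu_fac)          # permutation P and triangular factors L and U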

Calculus-Based Probability & Statistics.

Many times, it makes sense to fit a closed form distribution to data. For the first variable that you selected which is skewed to the right, shift it so that the minimum value is above zero as necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html .)

suppressWarnings(suppressMessages(library(MASS)))
min(lot_price$TotalBsmtSF)
## [1] 0

Rather than LotArea, this section uses TotalBsmtSF, which is also skewed to the right; its minimum is 0, so we shift it above zero by adding a very small number.

TotalBsmtSF<-lot_price$TotalBsmtSF + 0.0000001
min(TotalBsmtSF)
## [1] 1e-07

Exponential distribution

fit<-fitdistr(TotalBsmtSF, "exponential")

#Get lambda
(lambda<-fit$estimate)
##         rate 
## 0.0009456896
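As a quick sanity check, the maximum-likelihood estimate of the exponential rate is the reciprocal of the sample mean, so the fitted rate should match (output not shown):

1 / mean(TotalBsmtSF)   # should equal fit$estimate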

Sample of 1000

sample<- rexp(1000, lambda)

Histogram - Simulated vs Observed

hist(sample)

hist(lot_price$TotalBsmtSF)

Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

Using the cumulative distribution function, we find the 5th and 95th percentiles.

The p-th percentile of the exponential distribution is given by \[ F^{-1}(p) = \frac{-\ln(1-p)}{\lambda}. \] Find the 5th percentile:

qexp(0.05, rate=lambda)
## [1] 54.23904

Find the 95th percentile

qexp(0.95, rate = lambda)
## [1] 3167.776
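The same values follow from the percentile formula above (output not shown; they should match the qexp() results):

-log(1 - 0.05) / lambda   # 5th percentile via the inverse CDF
-log(1 - 0.95) / lambda   # 95th percentile via the inverse CDF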

Provide the empirical 5th and 95th percentiles of the data and discuss. Note that the exponential model above was fitted to TotalBsmtSF, while the empirical percentiles below are computed for LotArea, so the two sets of percentiles are not directly comparable; a like-for-like comparison would use the same variable in both steps.

#5th percentile
quantile(lot_price$LotArea, 0.05)
##     5% 
## 3311.7
#95th percentile
quantile(lot_price$LotArea, 0.95)
##      95% 
## 17401.15
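The prompt also asks for a 95% confidence interval from the empirical data assuming normality; a one-line sketch for the mean of TotalBsmtSF, the variable fitted above (output not shown):

t.test(lot_price$TotalBsmtSF)$conf.int   # 95% CI for the mean, assuming approximate normality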

Modeling.

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

fit <- lm(lot_price$SalePrice ~ lot_price$OverallQual + lot_price$GrLivArea + lot_price$GarageCars + lot_price$GarageArea, data=lot_price)
summary(fit)
## 
## Call:
## lm(formula = lot_price$SalePrice ~ lot_price$OverallQual + lot_price$GrLivArea + 
##     lot_price$GarageCars + lot_price$GarageArea, data = lot_price)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -372594  -21236   -1594   18625  301129 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -98436.050   4820.467 -20.420  < 2e-16 ***
## lot_price$OverallQual  26988.854   1067.393  25.285  < 2e-16 ***
## lot_price$GrLivArea       49.573      2.555  19.402  < 2e-16 ***
## lot_price$GarageCars   11317.522   3126.297   3.620 0.000305 ***
## lot_price$GarageArea      41.478     10.627   3.903 9.93e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40420 on 1455 degrees of freedom
## Multiple R-squared:  0.7418, Adjusted R-squared:  0.7411 
## F-statistic:  1045 on 4 and 1455 DF,  p-value: < 2.2e-16

SalePrice = -98436.050 + 26988.854 * OverallQual + 49.573 * GrLivArea + 11317.522 * GarageCars + 41.478 * GarageArea

par(mfrow=c(2,2))
X1<-lot_price$OverallQual
X2<-lot_price$GrLivArea
X3<-lot_price$GarageCars
X4<-lot_price$GarageArea
Y<-lot_price$SalePrice

plot(X1,Y, col="#f06292", main="OverallQual", ylab="Sale Price")
abline(lm(Y~X1), col="yellow", lwd=3) # regression line (y~x)

plot(X2,Y, col="#9c27b0", main="GrLivArea", ylab="Sale Price")
abline(lm(Y~X2), col="yellow", lwd=3) # regression line (y~x)

plot(X3,Y, col="#ce93d8", main="GarageCars", ylab="Sale Price")
abline(lm(Y~X3), col="yellow", lwd=3) # regression line (y~x)

plot(X4,Y, col="#c2185b", main="GarageArea", ylab="Sale Price")
abline(lm(Y~X4), col="yellow", lwd=3) # regression line (y~x)
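Standard residual diagnostics for the fitted model can also be inspected (output not shown):

par(mfrow = c(2, 2))
plot(fit)   # residuals vs fitted, normal Q-Q, scale-location, residuals vs leverage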

df_test<-read.csv("mytest.csv")
head(df_test)
# NOTE: these predictions use the training-set columns (lot_price), not df_test; see the corrected sketch at the end
SalePrice<-((26988.854*lot_price$OverallQual) + (49.573*lot_price$GrLivArea) +  (11317.522*lot_price$GarageCars) + (41.478*lot_price$GarageArea) -98436.050)

df_test<-df_test[,c("Id","OverallQual","GrLivArea","GarageCars","GarageArea")]
library(knitr)
kable(head(df_test))
Id OverallQual GrLivArea GarageCars GarageArea
1461 5 896 1 730
1462 6 1329 1 312
1463 5 1629 2 482
1464 6 1604 2 470
1465 8 1280 2 506
1466 6 1655 2 440
submission<-cbind(df_test$Id, SalePrice)
## Warning in cbind(df_test$Id, SalePrice): number of rows of result is not a
## multiple of vector length (arg 1)
colnames(submission)[1]<-"Id"
submission[submission<0]<-median(SalePrice)
submission<-as.data.frame(submission[1:1459,])
kable(head(submission))
Id SalePrice
1461 220620.7
1462 167773.1
1463 226877.0
1464 236184.2
1465 295064.4
1466 146571.1

Export the CSV and upload it to Kaggle.

write.csv(submission, file="submissionHA.csv", quote = FALSE, row.names = FALSE)

Username: hangrand. Kaggle score: 0.60113
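Note that the SalePrice vector above was computed from the training-set columns (lot_price) rather than from df_test, which is why cbind() warned about argument recycling and which likely explains the large leaderboard error. A corrected sketch that scores the test rows with predict() (hypothetical object and file names; output and score not shown):

fit2 <- lm(SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea, data = lot_price)
pred <- predict(fit2, newdata = df_test)                     # df_test as loaded above
pred[is.na(pred) | pred < 0] <- median(pred, na.rm = TRUE)   # guard against missing predictors or negative fits
submission2 <- data.frame(Id = df_test$Id, SalePrice = pred)
write.csv(submission2, file = "submissionHA_predict.csv", quote = FALSE, row.names = FALSE)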