library(MASS)     # fitdistr, ginv
library(knitr)    # kable
library(dplyr)    # select
library(ggplot2)
library(DT)       # datatable
library(reshape)  # melt
library(corrplot) # corrplot
library(Rmisc)    # CI
df <- read.csv("train.csv") # Kaggle training data

Introduction

Below is the dataset of house prices available from Kaggle.com. The dataset has 1460 observations of houses in Ames, Iowa, and 79 variables potentially contributing to the house sale price.

The full dataset and dictionary are available at: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data

#kable(head(df))
datatable(df, options = list(pageLength = 5, lengthMenu = c(5, 10, 40),
  initComplete = JS(
    "function(settings, json) {",
    "$(this.api().table().header()).css({'background-color': '#01975b', 'color': '#fff'});",
    "}")), rownames = TRUE)

Part 1: Variables

Pick one of the quantitative independent variables from the training data set (train.csv), and define that variable as X. Make sure this variable is skewed to the right! Pick the dependent variable and define it as Y.

#test variable
X1<-df$OverallQual
Y1<-df$SalePrice

plot(X1,Y1)
hist(X1, col="blue", main="Histogram of Overall Quality")
#chosen variable
X<-df$YearBuilt
Y<-df$SalePrice

plot(X,Y, col="#4caf50", main="Scatterplot of Year Built and Sale Price", xlab = "Year Built", ylab="Sale Price")
abline(lm(Y~X), col="yellow", lwd=3) # regression line (y~x) 

hist(X, col="green", main="Histogram of Year Built", xlab = "Year Built")

hist(Y, col="#80cbc4", main="Histogram of Sale Price", xlab = "Sale Price")

print("Summary of X variable: Year Built")
## [1] "Summary of X variable: Year Built"
summary(X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1872    1954    1973    1971    2000    2010
print("Summary of Y variable: Sale Price")
## [1] "Summary of Y variable: Sale Price"
summary(Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  130000  163000  180900  214000  755000

Part 2: Probability

Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 3rd quartile of the X variable, and the small letter “y” is estimated as the 2nd quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.

  a. \[ p_1 = p(X>x \mid Y>y) \]

Given an above median sale price, the probability that a house has a year built greater than the third quartile.

XQ3<-quantile(X, probs=0.75) #2000, 3rd quartile of X variable
YQ2<-quantile(Y, probs=0.50) #163000, 2nd quartile (median) of Y variable

n<-(nrow(df))
yearbuilt<-as.numeric(df$YearBuilt)
saleprice<-as.numeric(df$SalePrice)

nYQ2<-nrow(subset(df,saleprice>YQ2))


p1<-nrow(subset(df, yearbuilt > XQ3 & saleprice>YQ2))/nYQ2
p1
## [1] 0.4436813
  b. \[ p_2 = p(X>x, Y>y) \]

Given the complete data set, the probability that a house has a year built greater than the third quartile and a sale price above the median.

p2<-nrow(subset(df, yearbuilt > XQ3 & saleprice>YQ2))/n
p2
## [1] 0.2212329
  c. \[ p_3 = p(X<x \mid Y>y) \]

Given an above-median sale price, the probability that a house's year built is at or below the third quartile (the code uses <= so that p1 and p3 together account for all above-median houses).

p3<-nrow(subset(df, yearbuilt <=XQ3 & saleprice>YQ2))/nYQ2
p3
## [1] 0.5563187
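
As a sanity check, the same conditional probabilities can be computed with logical vectors; p1 and p3 are complementary given an above-median sale price, so they must sum to 1. A quick sketch:

#verify p1 and p3 with logical vectors; together they partition the above-median houses
p1_alt <- mean(yearbuilt > XQ3 & saleprice > YQ2) / mean(saleprice > YQ2)
p3_alt <- mean(yearbuilt <= XQ3 & saleprice > YQ2) / mean(saleprice > YQ2)
p1_alt + p3_alt #equals 1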
c1<-nrow(subset(df, yearbuilt <=XQ3 & saleprice<=YQ2))/n
c2<-nrow(subset(df, yearbuilt <=XQ3 & saleprice>YQ2))/n
c3<-c1+c2
c4<-nrow(subset(df, yearbuilt >XQ3 & saleprice<=YQ2))/n
c5<-nrow(subset(df, yearbuilt >XQ3 & saleprice>YQ2))/n
c6<-c4+c5
c7<-c1+c4
c8<-c2+c5
c9<-c3+c6
dfcounts<-matrix(round(c(c1,c2,c3,c4,c5,c6,c7,c8,c9),3), ncol=3, nrow=3, byrow=TRUE)
colnames(dfcounts)<-c(
"<=2d quartile",
">2d quartile",
"Total")
rownames(dfcounts)<-c("<=3rd quartile",">3rd quartile","Total")

print("Quartile Matrix by Percentage")
## [1] "Quartile Matrix by Percentage"
dfcounts<-as.table(dfcounts)
dfcounts
##                <=2d quartile >2d quartile Total
## <=3rd quartile         0.473        0.277 0.751
## >3rd quartile          0.028        0.221 0.249
## Total                  0.501        0.499 1.000
print("Quartile Matrix by Count")
## [1] "Quartile Matrix by Count"
dfvals<-round(dfcounts*n,0)
dfvals
##                <=2d quartile >2d quartile Total
## <=3rd quartile           691          404  1096
## >3rd quartile             41          323   364
## Total                    731          729  1460
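
The same contingency table of counts can be produced in one call with table() and addmargins(), which avoids the small rounding artifacts introduced by scaling rounded proportions back up by n. A sketch:

#counts directly from the logical splits, with row and column totals
addmargins(table(YearBuilt = yearbuilt > XQ3, SalePrice = saleprice > YQ2))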

Part 3: Independence

Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 3rd quartile for X, and let B be the new variable counting those observations for the 2nd quartile for Y. Does P(A|B)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.

papb<-c6*c8 #p(A)=p(X>x) and p(B)=p(Y>y) are the marginal totals from the table above
print (paste0("p(A)*p(B)=", round(papb,5)))
## [1] "p(A)*p(B)=0.12432"

\[ p(A \mid B) = p(X>x \mid Y>y) = 0.444 \]

\[ p(A) \cdot p(B) = 0.124 \]

\[ p(A \mid B) \neq p(A) \cdot p(B) \]

mat <- matrix(c(691, 404, 41, 323), 2, 2, byrow=TRUE) #2x2 counts from the quartile table

chisq.test(mat, correct=TRUE) 
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mat
## X-squared = 291.61, df = 1, p-value < 2.2e-16
With a test statistic of 291.61 and a p-value < 2.2e-16, the test rejects the null hypothesis of independence: the year-built split and the sale-price split are strongly associated.

#alternate chi sq approach: build the contingency table directly
chisq.test(table(yearbuilt > XQ3, saleprice > YQ2)) #same result as chisq.test(mat)

Part 4: Statistics

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y.

Also see Part 1.

isnum <- sapply(df, is.numeric)
dfnum<-df[ , isnum]
summary(dfnum)
##        Id           MSSubClass     LotFrontage        LotArea      
##  Min.   :   1.0   Min.   : 20.0   Min.   : 21.00   Min.   :  1300  
##  1st Qu.: 365.8   1st Qu.: 20.0   1st Qu.: 59.00   1st Qu.:  7554  
##  Median : 730.5   Median : 50.0   Median : 69.00   Median :  9478  
##  Mean   : 730.5   Mean   : 56.9   Mean   : 70.05   Mean   : 10517  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   3rd Qu.: 80.00   3rd Qu.: 11602  
##  Max.   :1460.0   Max.   :190.0   Max.   :313.00   Max.   :215245  
##                                   NA's   :259                      
##   OverallQual      OverallCond      YearBuilt     YearRemodAdd 
##  Min.   : 1.000   Min.   :1.000   Min.   :1872   Min.   :1950  
##  1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967  
##  Median : 6.000   Median :5.000   Median :1973   Median :1994  
##  Mean   : 6.099   Mean   :5.575   Mean   :1971   Mean   :1985  
##  3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004  
##  Max.   :10.000   Max.   :9.000   Max.   :2010   Max.   :2010  
##                                                                
##    MasVnrArea       BsmtFinSF1       BsmtFinSF2        BsmtUnfSF     
##  Min.   :   0.0   Min.   :   0.0   Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:   0.0   1st Qu.:   0.0   1st Qu.:   0.00   1st Qu.: 223.0  
##  Median :   0.0   Median : 383.5   Median :   0.00   Median : 477.5  
##  Mean   : 103.7   Mean   : 443.6   Mean   :  46.55   Mean   : 567.2  
##  3rd Qu.: 166.0   3rd Qu.: 712.2   3rd Qu.:   0.00   3rd Qu.: 808.0  
##  Max.   :1600.0   Max.   :5644.0   Max.   :1474.00   Max.   :2336.0  
##  NA's   :8                                                           
##   TotalBsmtSF       X1stFlrSF      X2ndFlrSF     LowQualFinSF    
##  Min.   :   0.0   Min.   : 334   Min.   :   0   Min.   :  0.000  
##  1st Qu.: 795.8   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##  Median : 991.5   Median :1087   Median :   0   Median :  0.000  
##  Mean   :1057.4   Mean   :1163   Mean   : 347   Mean   :  5.845  
##  3rd Qu.:1298.2   3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##  Max.   :6110.0   Max.   :4692   Max.   :2065   Max.   :572.000  
##                                                                  
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##                                                                   
##     HalfBath       BedroomAbvGr    KitchenAbvGr    TotRmsAbvGrd   
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   : 2.000  
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.: 5.000  
##  Median :0.0000   Median :3.000   Median :1.000   Median : 6.000  
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047   Mean   : 6.518  
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000   3rd Qu.: 7.000  
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000   Max.   :14.000  
##                                                                   
##    Fireplaces     GarageYrBlt     GarageCars      GarageArea    
##  Min.   :0.000   Min.   :1900   Min.   :0.000   Min.   :   0.0  
##  1st Qu.:0.000   1st Qu.:1961   1st Qu.:1.000   1st Qu.: 334.5  
##  Median :1.000   Median :1980   Median :2.000   Median : 480.0  
##  Mean   :0.613   Mean   :1979   Mean   :1.767   Mean   : 473.0  
##  3rd Qu.:1.000   3rd Qu.:2002   3rd Qu.:2.000   3rd Qu.: 576.0  
##  Max.   :3.000   Max.   :2010   Max.   :4.000   Max.   :1418.0  
##                  NA's   :81                                     
##    WoodDeckSF      OpenPorchSF     EnclosedPorch      X3SsnPorch    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Median :  0.00   Median : 25.00   Median :  0.00   Median :  0.00  
##  Mean   : 94.24   Mean   : 46.66   Mean   : 21.95   Mean   :  3.41  
##  3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00   3rd Qu.:  0.00  
##  Max.   :857.00   Max.   :547.00   Max.   :552.00   Max.   :508.00  
##                                                                     
##   ScreenPorch        PoolArea          MiscVal             MoSold      
##  Min.   :  0.00   Min.   :  0.000   Min.   :    0.00   Min.   : 1.000  
##  1st Qu.:  0.00   1st Qu.:  0.000   1st Qu.:    0.00   1st Qu.: 5.000  
##  Median :  0.00   Median :  0.000   Median :    0.00   Median : 6.000  
##  Mean   : 15.06   Mean   :  2.759   Mean   :   43.49   Mean   : 6.322  
##  3rd Qu.:  0.00   3rd Qu.:  0.000   3rd Qu.:    0.00   3rd Qu.: 8.000  
##  Max.   :480.00   Max.   :738.000   Max.   :15500.00   Max.   :12.000  
##                                                                        
##      YrSold       SalePrice     
##  Min.   :2006   Min.   : 34900  
##  1st Qu.:2007   1st Qu.:129975  
##  Median :2008   Median :163000  
##  Mean   :2008   Mean   :180921  
##  3rd Qu.:2009   3rd Qu.:214000  
##  Max.   :2010   Max.   :755000  
## 
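
To supplement the summary table with the plots the prompt asks for, the quick sketch below draws histograms for a few of the numeric variables; any subset of names(dfnum) works in place of the three chosen here.

#histograms of selected numeric variables, side by side
par(mfrow = c(1, 3))
for (v in c("LotArea", "GrLivArea", "SalePrice")) {
  hist(dfnum[[v]], main = paste("Histogram of", v), xlab = v, col = "lightblue")
}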

Confidence interval

Provide a 95% CI for the difference in the mean of the variables.

#t.test(x,y)
t.test(df$YearBuilt, df$SalePrice)
## 
##  Welch Two Sample t-test
## 
## data:  df$YearBuilt and df$SalePrice
## t = -86.071, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -183028.3 -174871.6
## sample estimates:
##  mean of x  mean of y 
##   1971.268 180921.196
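
For reference, the interval can be reproduced by hand from the Welch formula; the sketch below plugs in the degrees of freedom reported by t.test above.

#Welch 95% CI for the difference in means, computed manually
mx <- mean(df$YearBuilt); my <- mean(df$SalePrice)
se <- sqrt(var(df$YearBuilt)/nrow(df) + var(df$SalePrice)/nrow(df)) #standard error of the difference
(mx - my) + c(-1, 1) * qt(0.975, 1459) * se #matches the t.test interval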

Selective correlation matrix for chosen variables

Derive a correlation matrix for two of the quantitative variables you selected.

Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval. Discuss the meaning of your analysis.

myvars<-data.frame(df$YearBuilt, df$SalePrice)
#head(myvars) #view header
cor(myvars)
##              df.YearBuilt df.SalePrice
## df.YearBuilt    1.0000000    0.5228973
## df.SalePrice    0.5228973    1.0000000
cor.test(df$YearBuilt, df$SalePrice, conf.level = 0.99)
## 
##  Pearson's product-moment correlation
## 
## data:  df$YearBuilt and df$SalePrice
## t = 23.424, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
##  0.4721529 0.5701878
## sample estimates:
##       cor 
## 0.5228973
t.test(df$YearBuilt, df$SalePrice, conf.level = 0.99)
## 
##  Welch Two Sample t-test
## 
## data:  df$YearBuilt and df$SalePrice
## t = -86.071, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
##  -184312.4 -173587.5
## sample estimates:
##  mean of x  mean of y 
##   1971.268 180921.196
mymx<-as.matrix(cor(myvars))

With a t-statistic of 23.424 and a p-value < 2.2e-16, we reject the null hypothesis that the correlation is zero. With 99 percent confidence, the correlation between Year Built and Sale Price is estimated to be between 0.47 and 0.57: newer houses tend to sell for more.
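
The 99 percent interval can also be reconstructed with the Fisher z-transformation, which is what cor.test uses internally; a sketch using the reported correlation and the sample size n defined earlier:

r <- 0.5228973 #sample correlation from cor.test above
z <- atanh(r) #Fisher z-transformation
se <- 1/sqrt(n - 3) #approximate standard error of z
tanh(z + c(-1, 1) * qnorm(0.995) * se) #back-transform: approximately (0.472, 0.570)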

Part 5: Correlation

Linear Algebra and Correlation. Invert your correlation matrix. (This is known as the precision matrix and contains variance inflation factors on the diagonal.)

Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.

Correlation Matrix, Precision Matrix, Identity Matrix

#my correlation matrix
mymx
##              df.YearBuilt df.SalePrice
## df.YearBuilt    1.0000000    0.5228973
## df.SalePrice    0.5228973    1.0000000
#inverse of my correlation matrix, precision matrix
ginvmymx<-ginv(mymx)
ginvmymx
##            [,1]       [,2]
## [1,]  1.3763140 -0.7196709
## [2,] -0.7196709  1.3763140
#corr mat x precision mat
mymxginv<-mymx%*%ginvmymx
round(mymxginv,2)
##              [,1] [,2]
## df.YearBuilt    1    0
## df.SalePrice    0    1
#precision mat x corr mat
pxc<-ginvmymx%*%mymx
round(pxc,2)
##      df.YearBuilt df.SalePrice
## [1,]            1            0
## [2,]            0            1
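
As a cross-check, base R's solve() returns the same inverse for this invertible matrix, and the diagonal of the precision matrix matches the variance inflation factor formula 1/(1 - r^2). A sketch:

solve(mymx) #same inverse as ginv for an invertible matrix
1/(1 - 0.5228973^2) #reproduces the diagonal entry, 1.3763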

Principal Components Analysis

Conduct principal components analysis (research this!) and interpret. Discuss.
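
A minimal PCA sketch follows, assuming we restrict to the numeric columns without missing values so that prcomp can standardize them; interpretation should follow the loadings and variance proportions it actually reports.

#PCA on complete numeric columns, standardized
pcadata <- dfnum[, colSums(is.na(dfnum)) == 0]
pcadata$Id <- NULL #drop the row index, which is not a measurement
pca <- prcomp(pcadata, center = TRUE, scale. = TRUE)
summary(pca)$importance[, 1:5] #variance explained by the first five components
biplot(pca, cex = 0.5) #loadings of the first two components

The proportion-of-variance row shows how much of the total variation each component captures, and the loadings show which variables move together; components dominated by the correlated size-related variables identified below would be expected here.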

Header of all quantitative variables

#Correlation matrix of all quantitative variables in dataframe

kable(head(dfnum))
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SalePrice
1 60 65 8450 7 5 2003 2003 196 706 0 150 856 856 854 0 1710 1 0 2 1 3 1 8 0 2003 2 548 0 61 0 0 0 0 0 2 2008 208500
2 20 80 9600 6 8 1976 1976 0 978 0 284 1262 1262 0 0 1262 0 1 2 0 3 1 6 1 1976 2 460 298 0 0 0 0 0 0 5 2007 181500
3 60 68 11250 7 5 2001 2002 162 486 0 434 920 920 866 0 1786 1 0 2 1 3 1 6 1 2001 2 608 0 42 0 0 0 0 0 9 2008 223500
4 70 60 9550 7 5 1915 1970 0 216 0 540 756 961 756 0 1717 1 0 1 0 3 1 7 1 1998 3 642 0 35 272 0 0 0 0 2 2006 140000
5 60 84 14260 8 5 2000 2000 350 655 0 490 1145 1145 1053 0 2198 1 0 2 1 4 1 9 1 2000 3 836 192 84 0 0 0 0 0 12 2008 250000
6 50 85 14115 5 5 1993 1995 0 732 0 64 796 796 566 0 1362 1 0 1 1 1 1 5 0 1993 2 480 40 30 0 320 0 0 700 10 2009 143000

Header of correlation matrix for all quantitative variables

cormatrix<-cor(dfnum)
cordf<-as.data.frame(cormatrix)
kable(head(cordf))
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SalePrice
Id 1.0000000 0.0111565 NA -0.0332255 -0.0283648 0.0126089 -0.0127127 -0.0219976 NA -0.0050240 -0.0059677 -0.0079397 -0.0154146 0.0104960 0.0055898 -0.0442300 0.0082728 0.0022886 -0.0201547 0.0055875 0.0067838 0.0377186 0.0029512 0.0272387 -0.0197716 NA 0.0165697 0.0176338 -0.0296432 -0.0004769 0.0028892 -0.0466348 0.0013302 0.0570439 -0.0062424 0.0211722 0.0007118 -0.0219167
MSSubClass 0.0111565 1.0000000 NA -0.1397811 0.0326277 -0.0593158 0.0278501 0.0405810 NA -0.0698357 -0.0656486 -0.1407595 -0.2385184 -0.2517584 0.3078857 0.0464738 0.0748532 0.0034910 -0.0023325 0.1316082 0.1773544 -0.0234380 0.2817210 0.0403801 -0.0455693 NA -0.0401098 -0.0986715 -0.0125794 -0.0061001 -0.0120366 -0.0438245 -0.0260302 0.0082827 -0.0076833 -0.0135846 -0.0214070 -0.0842841
LotFrontage NA NA 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
LotArea -0.0332255 -0.1397811 NA 1.0000000 0.1058057 -0.0056363 0.0142277 0.0137884 NA 0.2141031 0.1111697 -0.0026184 0.2608331 0.2994746 0.0509859 0.0047790 0.2631162 0.1581545 0.0480456 0.1260306 0.0142595 0.1196899 -0.0177839 0.1900148 0.2713640 NA 0.1548707 0.1804028 0.1716977 0.0847738 -0.0183397 0.0204228 0.0431604 0.0776724 0.0380677 0.0012050 -0.0142614 0.2638434
OverallQual -0.0283648 0.0326277 NA 0.1058057 1.0000000 -0.0919323 0.5723228 0.5506839 NA 0.2396660 -0.0591187 0.3081589 0.5378085 0.4762238 0.2954929 -0.0304293 0.5930074 0.1110978 -0.0401502 0.5505997 0.2734581 0.1016764 -0.1838822 0.4274523 0.3967650 NA 0.6006707 0.5620218 0.2389234 0.3088188 -0.1139369 0.0303706 0.0648864 0.0651658 -0.0314062 0.0708152 -0.0273467 0.7909816
OverallCond 0.0126089 -0.0593158 NA -0.0056363 -0.0919323 1.0000000 -0.3759832 0.0737415 NA -0.0462309 0.0402292 -0.1368406 -0.1710975 -0.1442028 0.0289421 0.0254943 -0.0796859 -0.0549415 0.1178209 -0.1941495 -0.0607693 0.0129801 -0.0870009 -0.0575832 -0.0238200 NA -0.1857575 -0.1515214 -0.0033337 -0.0325888 0.0703562 0.0255037 0.0548105 -0.0019849 0.0687768 -0.0035108 0.0439497 -0.0778559

Header of variables with correlation greater than 0.5

#Source from http://stackoverflow.com/questions/7074246/show-correlations-as-an-ordered-list-not-as-a-large-matrix

cordf[cordf == 1] <- NA #drop correlation of 1, diagonals
cordf[abs(cordf) < 0.5] <- NA # drop correlations of less than 0.5
cordf<-as.matrix(cordf)
cordf2<- na.omit(melt(cordf)) 
kable(head(cordf2[order(-abs(cordf2$value)),])) # sort by highest correlations
X1 X2 value
1016 GarageArea GarageCars 0.8824754
1053 GarageCars GarageArea 0.8824754
632 TotRmsAbvGrd GrLivArea 0.8254894
891 GrLivArea TotRmsAbvGrd 0.8254894
470 X1stFlrSF TotalBsmtSF 0.8195300
507 TotalBsmtSF X1stFlrSF 0.8195300
#corrplot(cordf, type = "upper", tl.col = "black", tl.srt = 45)

All variables with correlation to Sale Price greater than 0.5

cordf2<-as.data.frame(cordf2)
# head(cordf2) #view head
# str(cordf2) #view structure
topcors <- cordf2[ which(cordf2$X2=='SalePrice'),]

topcorsdf<-topcors[order(-abs(topcors$value)),]# sort by highest correlations
cors1<-data.frame(topcorsdf$X1,topcorsdf$X2,topcorsdf$value)
kable(cors1)
topcorsdf.X1 topcorsdf.X2 topcorsdf.value
OverallQual SalePrice 0.7909816
GrLivArea SalePrice 0.7086245
GarageCars SalePrice 0.6404092
GarageArea SalePrice 0.6234314
TotalBsmtSF SalePrice 0.6135806
X1stFlrSF SalePrice 0.6058522
FullBath SalePrice 0.5606638
TotRmsAbvGrd SalePrice 0.5337232
YearBuilt SalePrice 0.5228973
YearRemodAdd SalePrice 0.5071010

Plot of correlation to Sale Price

par(mar=c(8,8,1,1))
barplot(topcorsdf$value, ylab="Correlation to Sale Price", ylim=c(0,1), col=rainbow(20), las=2, names.arg=topcorsdf$X1)

Variables with strongest correlation to Sale Price in descending order:

  • OverallQual
  • GrLivArea
  • GarageCars
  • GarageArea
  • TotalBsmtSF
  • X1stFlrSF
  • FullBath
  • TotRmsAbvGrd
  • YearBuilt
  • YearRemodAdd
cormatdata <- select(df, OverallQual, GrLivArea, GarageCars, GarageArea, TotalBsmtSF, X1stFlrSF, FullBath, TotRmsAbvGrd)
cormat1 <- cor(cormatdata)
cormat1
##              OverallQual GrLivArea GarageCars GarageArea TotalBsmtSF
## OverallQual    1.0000000 0.5930074  0.6006707  0.5620218   0.5378085
## GrLivArea      0.5930074 1.0000000  0.4672474  0.4689975   0.4548682
## GarageCars     0.6006707 0.4672474  1.0000000  0.8824754   0.4345848
## GarageArea     0.5620218 0.4689975  0.8824754  1.0000000   0.4866655
## TotalBsmtSF    0.5378085 0.4548682  0.4345848  0.4866655   1.0000000
## X1stFlrSF      0.4762238 0.5660240  0.4393168  0.4897817   0.8195300
## FullBath       0.5505997 0.6300116  0.4696720  0.4056562   0.3237224
## TotRmsAbvGrd   0.4274523 0.8254894  0.3622886  0.3378221   0.2855726
##              X1stFlrSF  FullBath TotRmsAbvGrd
## OverallQual  0.4762238 0.5505997    0.4274523
## GrLivArea    0.5660240 0.6300116    0.8254894
## GarageCars   0.4393168 0.4696720    0.3622886
## GarageArea   0.4897817 0.4056562    0.3378221
## TotalBsmtSF  0.8195300 0.3237224    0.2855726
## X1stFlrSF    1.0000000 0.3806375    0.4095160
## FullBath     0.3806375 1.0000000    0.5547843
## TotRmsAbvGrd 0.4095160 0.5547843    1.0000000
corrplot(cormat1, method="circle")

Part 6: Sampling

Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data.

For your variable that is skewed to the right, shift it so that the minimum value is above zero. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).

Minimum value is above zero

#check that the minimum value is already above zero, so no shift is needed
min(df$YearBuilt)
## [1] 1872

Run fitdistr to fit an exponential probability density function.

fit <- fitdistr(df$YearBuilt, "exponential")

Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)).

#optimal value of λ for this distribution

lambda <- fit$estimate
sampledf <- rexp(1000, lambda)
lambda
##         rate 
## 0.0005072877
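
For an exponential distribution, the maximum-likelihood estimate of the rate is the reciprocal of the sample mean, which reproduces the fitted value:

1/mean(df$YearBuilt) #MLE of the exponential rate, approximately 0.0005072877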

Plot a histogram and compare it with a histogram of your original variable.

#Plot a histogram and compare it with a histogram of your original variable.

sampledf<-data.frame(as.numeric(sampledf))
colnames(sampledf)[1] <- "sample"
str(sampledf)
## 'data.frame':    1000 obs. of  1 variable:
##  $ sample: num  1253 2534 6874 6833 43724 ...
head(sampledf)
##       sample
## 1  1252.9546
## 2  2534.0248
## 3  6874.1797
## 4  6833.2280
## 5 43724.1191
## 6   275.5495
hist(sampledf$sample, col="green", main="Histogram of Exponential Samples", xlab = "Sampled value", breaks=30)

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).

#find the 5th and 95th percentiles
print("5th percentile")
## [1] "5th percentile"
qexp(.05,rate = lambda)
## [1] 101.1128
print("95th percentile")
## [1] "95th percentile"
qexp(.95, rate = lambda)
## [1] 5905.391
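
These quantiles follow from inverting the exponential CDF:

\[ F(t) = 1 - e^{-\lambda t} \quad \Rightarrow \quad t_p = -\ln(1-p)/\lambda \]

A quick check:

-log(1 - 0.05)/lambda #reproduces qexp(.05, rate = lambda), approximately 101.1
-log(1 - 0.95)/lambda #reproduces qexp(.95, rate = lambda), approximately 5905.4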

Also generate a 95% confidence interval from the empirical data, assuming normality.

#95% confidence interval from the empirical data
CI(df$YearBuilt, 0.95)
##    upper     mean    lower 
## 1972.818 1971.268 1969.717

Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

quantile(df$YearBuilt, .05)
##   5% 
## 1916
quantile(df$YearBuilt, .95)
##  95% 
## 2007

The empirical percentiles (1916 and 2007) look nothing like the exponential model's (101 and 5905). YearBuilt is concentrated in a narrow band above its minimum of 1872, while an exponential distribution piles its mass near zero and has a long right tail, so the exponential fit is a poor description of this variable, as the comparison of the two histograms also shows.

Part 7: Modeling

Modeling. Build some type of regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

Test Model 1: AIC in a Stepwise Algorithm

#alternate model: backward stepwise selection by AIC over the columns with no missing values
modvars <- df[, which(sapply(df, function(x) sum(is.na(x))) == 0)]
modvars$Id <- NULL #drop the row index, which is not a predictor
model1 <- step(lm(SalePrice ~ ., data = modvars), direction = 'backward', trace = FALSE)
model1

#dfglm <- glm(df$SalePrice ~ ., family=binomial, data = df)
#dfstep <- stepAIC(dfglm, trace = FALSE)
#dfstep$anova

Test Model 2: Multiple Linear Regression

fit <- lm(SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea, data=df)
summary(fit) # show results
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + 
##     GarageArea, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -372594  -21236   -1594   18625  301129 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -98436.050   4820.467 -20.420  < 2e-16 ***
## OverallQual  26988.854   1067.393  25.285  < 2e-16 ***
## GrLivArea       49.573      2.555  19.402  < 2e-16 ***
## GarageCars   11317.522   3126.297   3.620 0.000305 ***
## GarageArea      41.478     10.627   3.903 9.93e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40420 on 1455 degrees of freedom
## Multiple R-squared:  0.7418, Adjusted R-squared:  0.7411 
## F-statistic:  1045 on 4 and 1455 DF,  p-value: < 2.2e-16

Using the coefficients from the regression summary, write the fitted multiple linear regression equation.

\[ SalePrice = -98436.050 + 26988.854 \cdot OverallQual + 49.573 \cdot GrLivArea + 11317.522 \cdot GarageCars + 41.478 \cdot GarageArea \]

par(mfrow=c(2,2))
X1<-df$OverallQual
X2<-df$GrLivArea
X3<-df$GarageCars
X4<-df$GarageArea
Y<-df$SalePrice

plot(X1,Y, col="#f06292", main="OverallQual", ylab="Sale Price")
abline(lm(Y~X1), col="yellow", lwd=3) # regression line (y~x)

plot(X2,Y, col="#9c27b0", main="GrLivArea", ylab="Sale Price")
abline(lm(Y~X2), col="yellow", lwd=3) # regression line (y~x)

plot(X3,Y, col="#ce93d8", main="GarageCars", ylab="Sale Price")
abline(lm(Y~X3), col="yellow", lwd=3) # regression line (y~x)

plot(X4,Y, col="#c2185b", main="GarageArea", ylab="Sale Price")
abline(lm(Y~X4), col="yellow", lwd=3) # regression line (y~x)

Load the test data set and compute predicted sale prices from the fitted regression equation. Select the required columns and export to CSV for the contest entry.

dftest <- read.csv("test.csv")
#str(dftest)
#nrow(dftest)

SalePrice<-((26988.854*dftest$OverallQual) + (49.573*dftest$GrLivArea) +  (11317.522*dftest$GarageCars) + (41.478*dftest$GarageArea) -98436.050) #predict on the test set (dftest, not df)
#head(SalePrice)

dftest<-dftest[,c("Id","OverallQual","GrLivArea","GarageCars","GarageArea")]

kable(head(dftest))
Id OverallQual GrLivArea GarageCars GarageArea
1461 5 896 1 730
1462 6 1329 1 312
1463 5 1629 2 482
1464 6 1604 2 470
1465 8 1280 2 506
1466 6 1655 2 440
#tail(dftest)

submission <- data.frame(Id = dftest$Id, SalePrice = SalePrice)
#replace missing or negative predictions (NAs arise from missing garage values in the test set)
submission$SalePrice[is.na(submission$SalePrice) | submission$SalePrice < 0] <- median(SalePrice, na.rm = TRUE)
kable(head(submission))
Id SalePrice
1461 122522.1
1462 153638.2
1463 159890.1
1464 185141.9
1465 224551.1
1466 186425.8
#str(submission)
#dim(submission)
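
A less error-prone alternative is to let predict() score the test rows from the fitted model directly, rather than hand-copying coefficients; a sketch using the model fit above:

#score the test set with predict(), then apply the same NA/negative handling
pred <- predict(fit, newdata = dftest)
pred[is.na(pred) | pred < 0] <- median(pred, na.rm = TRUE)
submission2 <- data.frame(Id = dftest$Id, SalePrice = pred)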

Export CSV and submit to Kaggle.

Eval set to FALSE for reader convenience.

write.csv(submission, file = "submissionAAP.csv", quote=FALSE, row.names=FALSE)

Kaggle score: 0.60114