library(DT)
library(e1071)
library(MASS)
Load data:
housing <- read.csv("https://raw.githubusercontent.com/choudhury1023/DATA-605/master/Final%20Exam/train.csv", stringsAsFactors = FALSE)
datatable(housing, options = list(pageLength = 10))
Pick one of the quantitative independent variables from the training data set (train.csv), and define that variable as X. Pick SalePrice as the dependent variable, and define it as Y for the next analysis.
For my project I am picking BedroomAbvGr as my independent variable.
skewness(housing$BedroomAbvGr)
Define variables:
X <- housing$BedroomAbvGr
Y <- housing$SalePrice
Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 4th quartile of the X variable, and the small letter “y” is estimated as the 2nd quartile of the Y variable. Interpret the meaning of all probabilities.
\[a.\quad P(X>x | Y>y)\quad\quad b.\quad P(X>x, Y>y)\quad\quad c.\quad P(X<x | Y>y)\]
4th quartile of X & 2nd Quartile of Y
# 4th quartile of X
(x <- quantile(X, 1))
## 100%
## 8
# 2nd Quartile of Y
(y <- quantile(Y, 0.50))
## 50%
## 163000
# Total number of rows in the dataframe
(n<-(nrow(housing)))
## [1] 1460
# Number of rows where SalePrice is higher than the 2nd quartile (higher than 163,000)
(ny<-nrow(subset(housing, Y > y)))
## [1] 728
(pa <- nrow(subset(housing, X > x & Y > y))/ny)
## [1] 0
(pb<-nrow(subset(housing, X > x & Y > y))/n)
## [1] 0
(pc <- nrow(subset(housing, X < x & Y > y))/ny)
## [1] 0.9986264
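Written out with the counts above (728 homes sold above the median price, and none has more than 8 bedrooms):
\[P(X>x \mid Y>y)=\frac{0}{728}=0, \quad P(X>x,\ Y>y)=\frac{0}{1460}=0, \quad P(X<x \mid Y>y)=\frac{727}{728}\approx 0.9986\]
In words: the 4th quartile of BedroomAbvGr is its maximum (8 bedrooms), so no house can exceed it, and the probabilities in (a) and (b) are zero. For (c), given that a house sold above the median price, there is a 99.86% chance it has fewer than 8 bedrooms; the single exception is a house with exactly 8 bedrooms.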
Does splitting the training data in this fashion make them independent? In other words, does \(P(X|Y)=P(X)P(Y)\)? Check mathematically, and then evaluate by running a Chi Square test for association. You might have to research this.
Splitting the training data in this fashion does not make them independent; as shown below, \(P(X|Y) \neq P(X)P(Y)\).
Contingency table
c1 <- nrow(subset(housing, X <=x & Y<=y))
c2 <- nrow(subset(housing, X <=x & Y>y))
c3 <- c1+c2
c4 <- nrow(subset(housing, X >x & Y<=y))
c5 <- nrow(subset(housing, X >x & Y>y))
c6 <- c4+c5
c7 <- c1+c4
c8 <- c2+c5
c9 <- c3+c6
dfcont<-matrix(round(c(c1, c2, c3, c4, c5, c6, c7, c8, c9), 3), ncol=3, nrow=3, byrow=TRUE)
colnames(dfcont) <-c (
"Y<=y",
"Y>y",
"Total")
rownames(dfcont) <-c ("X<=x","X>x","Total")
(dfcont <- knitr::kable(as.table(dfcont)))
|      | Y<=y | Y>y | Total |
|---|---|---|---|
| X<=x | 732 | 728 | 1460 |
| X>x | 0 | 0 | 0 |
| Total | 732 | 728 | 1460 |
To check mathematically and evaluate by running a Chi-square test for association, I will use a lower quartile than the 4th quartile, since the contingency table above contains 0 (zero) cells when the 4th quartile is used.
3rd quartile of X
(x <- quantile(X, 0.75))
## 75%
## 3
Contingency table using small letter “x” as the 3rd quartile of the X variable
c1 <- nrow(subset(housing, X <=x & Y<=y))
c2 <- nrow(subset(housing, X <=x & Y>y))
c3 <- c1+c2
c4 <- nrow(subset(housing, X >x & Y<=y))
c5 <- nrow(subset(housing, X >x & Y>y))
c6 <- c4+c5
c7 <- c1+c4
c8 <- c2+c5
c9 <- c3+c6
dfcont1 <- matrix(round(c(c1, c2, c3, c4, c5, c6, c7 ,c8 ,c9), 3), ncol=3, nrow=3, byrow=TRUE)
colnames(dfcont1) <-c (
"Y<=y",
"Y>y",
"Total")
rownames(dfcont1) <-c ("X<=x","X>x","Total")
(dfcont1 <- knitr::kable(as.table(dfcont1)))
|      | Y<=y | Y>y | Total |
|---|---|---|---|
| X<=x | 639 | 579 | 1218 |
| X>x | 93 | 149 | 242 |
| Total | 732 | 728 | 1460 |
\[P(X>x \mid Y>y) = 149/728 = 0.2046703\]
\[P(X>x) = 242/1460 = 0.1657534, \quad P(Y>y) = 728/1460 = 0.4986301\]
\[P(X>x)P(Y>y) \quad = \quad 0.1657534 \times 0.4986301 \quad = \quad 0.0826496\]
Since \(0.2046703 \neq 0.0826496\) (and, equivalently, the conditional \(P(X>x \mid Y>y) = 0.2047\) differs from the unconditional \(P(X>x) = 0.1658\)),
\[\therefore P(X|Y) \neq P(X)P(Y)\]
Chi square test
mat <- matrix(c(639, 579, 93, 149), 2, 2, byrow=T)
chisq.test(mat, correct=TRUE)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mat
## X-squared = 15.347, df = 1, p-value = 8.946e-05
From our Chi-square test we can see the p-value is very small, which suggests rejecting \(H_{0}\); in other words, the two variables are not independent.
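The same conclusion can be reached without building the matrix by hand; a minimal sketch, using the x and y cutoffs defined above:
# Cross-tabulate the two indicator variables and run the same test
tab <- table(X > x, Y > y)
chisq.test(tab, correct = TRUE)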
Provide univariate descriptive statistics and appropriate plots for both variables. Provide a scatterplot of X and Y. Transform both variables simultaneously using Box-Cox transformations. You might have to research this. Using the transformed variables, run a correlation analysis and interpret. Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval. Discuss the meaning of your analysis.
my_df <- cbind.data.frame(X, Y)
colnames(my_df) <- c("BedroomAbvGr", "SalePrice")
summary(my_df)
## BedroomAbvGr SalePrice
## Min. :0.000 Min. : 34900
## 1st Qu.:2.000 1st Qu.:129975
## Median :3.000 Median :163000
## Mean :2.866 Mean :180921
## 3rd Qu.:3.000 3rd Qu.:214000
## Max. :8.000 Max. :755000
skewness(my_df$BedroomAbvGr)
## [1] 0.2113551
quantile(my_df$BedroomAbvGr)
## 0% 25% 50% 75% 100%
## 0 2 3 3 8
quantile(my_df$SalePrice)
## 0% 25% 50% 75% 100%
## 34900 129975 163000 214000 755000
par(mfrow=c(2, 2))
hist(my_df$BedroomAbvGr, col = "blue")
boxplot(my_df$BedroomAbvGr, main="Boxplot BedroomAbvGr")
qqnorm(my_df$BedroomAbvGr)
qqline(my_df$BedroomAbvGr)
par(mfrow=c(2, 2))
hist(my_df$SalePrice, col = "green")
boxplot(my_df$SalePrice, main="Boxplot SalePrice")
qqnorm(my_df$SalePrice)
qqline(my_df$SalePrice)
mod = lm(SalePrice ~ BedroomAbvGr, data = my_df)
summary(mod)
##
## Call:
## lm(formula = SalePrice ~ BedroomAbvGr, data = my_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -143109 -52672 -17719 31891 555510
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 133966 7492 17.881 < 2e-16 ***
## BedroomAbvGr 16381 2514 6.516 9.93e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78340 on 1458 degrees of freedom
## Multiple R-squared: 0.0283, Adjusted R-squared: 0.02763
## F-statistic: 42.46 on 1 and 1458 DF, p-value: 9.927e-11
plot(my_df$BedroomAbvGr, my_df$SalePrice, main = "Scatterplot SalePrice by BedroomAbvGr ")
abline(lm(my_df$SalePrice ~ my_df$BedroomAbvGr), col="red", lwd=3)
plot(mod, pch=16, which=1)
From the model and scatterplot above we can see that neither variable is normally distributed, and the relationship between the two does not follow a straight line, so a strong linear relationship is unlikely. The p-value is very small, nearly zero, so we reject the null hypothesis that the slope is zero. The \(R^2\) value of 0.0283 indicates only 2.83% of the variance in SalePrice can be explained by BedroomAbvGr. The residual standard error of 78340 tells us a typical prediction of SalePrice based on BedroomAbvGr will be off by about $78,340, which is a very large error. The plot of residuals against fitted values supports the lack of model fit.
Box-Cox Transformation
Box-Cox gives us the lambda value to which the variables should be raised in order to make them closer to normally distributed.
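For reference, the Box-Cox family indexed by \(\lambda\) is
\[y(\lambda)=\begin{cases}\dfrac{y^{\lambda}-1}{\lambda}, & \lambda\neq 0\\ \log(y), & \lambda=0\end{cases}\]
The code below uses the simpler power form \(y^{\lambda}\), a monotone rescaling of the same transformation, with the \(\lambda\) that maximizes the profile log-likelihood returned by boxcox().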
trans <- boxcox(mod)
trans_df <- as.data.frame(trans)
optimal_lambda <- trans_df[which.max(trans$y),1]
transdata = cbind( my_df,my_df$BedroomAbvGr^optimal_lambda, my_df$SalePrice^optimal_lambda)
names(transdata)[3] = "BedroomAbvGr_transf"
names(transdata)[4] = "SalePrice_transf"
head(transdata,5)
## BedroomAbvGr SalePrice BedroomAbvGr_transf SalePrice_transf
## 1 3 208500 0.8949648 0.2902128
## 2 3 181500 0.8949648 0.2943068
## 3 3 223500 0.8949648 0.2881834
## 4 3 140000 0.8949648 0.3021267
## 5 4 250000 0.8693324 0.2849401
summary(transdata)
## BedroomAbvGr SalePrice BedroomAbvGr_transf SalePrice_transf
## Min. :0.000 Min. : 34900 Min. :0.8105 Min. :0.2548
## 1st Qu.:2.000 1st Qu.:129975 1st Qu.:0.8950 1st Qu.:0.2895
## Median :3.000 Median :163000 Median :0.8950 Median :0.2975
## Mean :2.866 Mean :180921 Mean : Inf Mean :0.2971
## 3rd Qu.:3.000 3rd Qu.:214000 3rd Qu.:0.9324 3rd Qu.:0.3044
## Max. :8.000 Max. :755000 Max. : Inf Max. :0.3476
hist(transdata$BedroomAbvGr_transf, col = "blue", main = "Histogram of BedroomAbvGr Transformed by Box-Cox")
hist(transdata$SalePrice_transf, col = "green", main = "Histogram of SalePrice Transformed by Box-Cox")
mod2 = lm(SalePrice_transf ~ BedroomAbvGr, data = transdata)
summary(mod2)
##
## Call:
## lm(formula = SalePrice_transf ~ BedroomAbvGr, data = transdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.039468 -0.007354 0.000465 0.007729 0.047853
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3060230 0.0011188 273.534 < 2e-16 ***
## BedroomAbvGr -0.0031183 0.0003754 -8.307 2.23e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0117 on 1458 degrees of freedom
## Multiple R-squared: 0.04519, Adjusted R-squared: 0.04453
## F-statistic: 69 on 1 and 1458 DF, p-value: 2.225e-16
plot(mod2, pch=16, which=1)
From the plots above we can see the Box-Cox transformation makes SalePrice look much closer to normally distributed, and the \(R^2\) value is now 0.04519, a slight increase, but the linear relationship between the two variables remains weak. Note that because BedroomAbvGr contains zeros and the optimal lambda is negative (about -0.1), the transformed BedroomAbvGr contains Inf values (visible in the summary above), so the untransformed BedroomAbvGr is used as the predictor in mod2.
Correlation and 99% Confidence Interval
cor.test(transdata[,"BedroomAbvGr"], transdata[,"SalePrice_transf"], conf.level = .99)
##
## Pearson's product-moment correlation
##
## data: transdata[, "BedroomAbvGr"] and transdata[, "SalePrice_transf"]
## t = -8.3065, df = 1458, p-value = 2.225e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## -0.2759955 -0.1472990
## sample estimates:
## cor
## -0.2125689
The correlation after transformation is -0.2125689. The p-value is nearly 0, so we reject the null hypothesis that the correlation is 0. The 99% confidence interval for the correlation between the two variables is (-0.2759955, -0.1472990). This indicates a weak negative linear relationship between the two variables.
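For reference, cor.test() builds this interval from Fisher's z-transformation of the sample correlation:
\[z=\operatorname{arctanh}(r)=\tfrac{1}{2}\ln\frac{1+r}{1-r}=-0.2159, \qquad \tanh\!\left(z\pm\frac{z_{0.995}}{\sqrt{n-3}}\right)=\tanh\!\left(-0.2159\pm\frac{2.5758}{\sqrt{1457}}\right)\approx(-0.2760,\ -0.1473)\]
which reproduces the reported interval.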
Invert your correlation matrix. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.
##correlation matrix
##original data
cor_data <- cor(my_df[,c("BedroomAbvGr","SalePrice")])
cor_data
## BedroomAbvGr SalePrice
## BedroomAbvGr 1.0000000 0.1682132
## SalePrice 0.1682132 1.0000000
##correlation matrix
##transformed data
cor_transdata <- cor(transdata[,c("BedroomAbvGr","SalePrice_transf")])
cor_transdata
## BedroomAbvGr SalePrice_transf
## BedroomAbvGr 1.0000000 -0.2125689
## SalePrice_transf -0.2125689 1.0000000
##precision matrix
##original data
pre_data <- solve(cor_data)
pre_data
## BedroomAbvGr SalePrice
## BedroomAbvGr 1.0291196 -0.1731115
## SalePrice -0.1731115 1.0291196
##precision matrix
##transformed data
pre_trans <- solve(cor_transdata)
pre_trans
## BedroomAbvGr SalePrice_transf
## BedroomAbvGr 1.0473239 0.2226285
## SalePrice_transf 0.2226285 1.0473239
##correlation matrix multiplied by precision matrix
##original data
cor_data %*% pre_data
## BedroomAbvGr SalePrice
## BedroomAbvGr 1.000000e+00 0
## SalePrice 2.775558e-17 1
##correlation matrix multiplied by precision matrix
##transformed data
cor_transdata %*% pre_trans
## BedroomAbvGr SalePrice_transf
## BedroomAbvGr 1.000000e+00 0
## SalePrice_transf -2.775558e-17 1
##precision matrix multiplied by correlation matrix
##original data
pre_data %*% cor_data
## BedroomAbvGr SalePrice
## BedroomAbvGr 1.000000e+00 0
## SalePrice 2.775558e-17 1
##precision matrix multiplied by correlation matrix
##transformed data
pre_trans %*% cor_transdata
## BedroomAbvGr SalePrice_transf
## BedroomAbvGr 1.000000e+00 0
## SalePrice_transf -2.775558e-17 1
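For a 2×2 correlation matrix these results can be verified by hand. With off-diagonal correlation \(r\),
\[\begin{pmatrix}1 & r\\ r & 1\end{pmatrix}^{-1}=\frac{1}{1-r^{2}}\begin{pmatrix}1 & -r\\ -r & 1\end{pmatrix}\]
so the diagonal entries of the precision matrix are the variance inflation factors \(1/(1-r^{2})\): with \(r=-0.2125689\) this gives \(1/(1-0.0452)\approx 1.0473\) for the transformed data (and 1.0291 for the original data), matching the outputs above. Both products recover the identity matrix, as they must; the off-diagonal values on the order of \(10^{-17}\) are floating-point rounding error.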
Many times, it makes sense to fit a closed form distribution to data. For your non-transformed independent variable, location shift it so that the minimum value is above zero. Then load the MASS package and run fitdistr to fit a density function of your choice. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of the parameters for this distribution, and then take 1000 samples from this distribution (e.g., rexp(1000, ???) for an exponential). Plot a histogram and compare it with a histogram of your non-transformed original variable.
##shift and find minimum value of chosen variable
BedroomAbvGr <- my_df$BedroomAbvGr + 1e-32
min(BedroomAbvGr)
## [1] 1e-32
(fit <- fitdistr(BedroomAbvGr, "exponential"))
## rate
## 0.348864994
## (0.009130214)
(lambda <- fit$estimate)
## rate
## 0.348865
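As a sanity check, the maximum-likelihood estimate of an exponential rate is the reciprocal of the sample mean, and the mean of BedroomAbvGr reported earlier is about 2.866:
\[\hat{\lambda}=\frac{1}{\bar{x}}\approx\frac{1}{2.866}\approx 0.3489\]
which agrees with the fitdistr() estimate.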
samp <- rexp(1000, lambda)
## Warning in rexp(1000, lambda): '.Random.seed' is not an integer vector but
## of type 'NULL', so ignored
par(mfrow=c(1, 2))
hist(samp, xlab = "BedroomAbvGr", main = "Simulated")
hist(my_df$BedroomAbvGr, xlab = "BedroomAbvGr", main = "Observed")
There are differences between the simulated and observed data: visually, from the histograms we can see the simulated data is heavily skewed to the right, whereas the observed data is concentrated more in the center.
Build some type of regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
For my model I will use multiple linear regression, using only the numeric variables whose correlation with SalePrice is higher than 0.5.
#Create dataframe with numeric variable only
quantVar <- sapply(housing, is.numeric)
quantVar_df <- housing[ , quantVar]
head(quantVar_df)
## Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt
## 1 1 60 65 8450 7 5 2003
## 2 2 20 80 9600 6 8 1976
## 3 3 60 68 11250 7 5 2001
## 4 4 70 60 9550 7 5 1915
## 5 5 60 84 14260 8 5 2000
## 6 6 50 85 14115 5 5 1993
## YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 2003 196 706 0 150 856
## 2 1976 0 978 0 284 1262
## 3 2002 162 486 0 434 920
## 4 1970 0 216 0 540 756
## 5 2000 350 655 0 490 1145
## 6 1995 0 732 0 64 796
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## 1 856 854 0 1710 1 0
## 2 1262 0 0 1262 0 1
## 3 920 866 0 1786 1 0
## 4 961 756 0 1717 1 0
## 5 1145 1053 0 2198 1 0
## 6 796 566 0 1362 1 0
## FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces
## 1 2 1 3 1 8 0
## 2 2 0 3 1 6 1
## 3 2 1 3 1 6 1
## 4 1 0 3 1 7 1
## 5 2 1 4 1 9 1
## 6 1 1 1 1 5 0
## GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch
## 1 2003 2 548 0 61 0
## 2 1976 2 460 298 0 0
## 3 2001 2 608 0 42 0
## 4 1998 3 642 0 35 272
## 5 2000 3 836 192 84 0
## 6 1993 2 480 40 30 0
## X3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SalePrice
## 1 0 0 0 0 2 2008 208500
## 2 0 0 0 0 5 2007 181500
## 3 0 0 0 0 9 2008 223500
## 4 0 0 0 0 2 2006 140000
## 5 0 0 0 0 12 2008 250000
## 6 320 0 0 700 10 2009 143000
#Find correlation between SalePrice and other numeric variable
corSales <-data.frame(apply(quantVar_df,2, function(col)cor(col, quantVar_df$SalePrice, use = "complete.obs")))
colnames(corSales) <- c("cor")
corSales
## cor
## Id -0.02191672
## MSSubClass -0.08428414
## LotFrontage 0.35179910
## LotArea 0.26384335
## OverallQual 0.79098160
## OverallCond -0.07785589
## YearBuilt 0.52289733
## YearRemodAdd 0.50710097
## MasVnrArea 0.47749305
## BsmtFinSF1 0.38641981
## BsmtFinSF2 -0.01137812
## BsmtUnfSF 0.21447911
## TotalBsmtSF 0.61358055
## X1stFlrSF 0.60585218
## X2ndFlrSF 0.31933380
## LowQualFinSF -0.02560613
## GrLivArea 0.70862448
## BsmtFullBath 0.22712223
## BsmtHalfBath -0.01684415
## FullBath 0.56066376
## HalfBath 0.28410768
## BedroomAbvGr 0.16821315
## KitchenAbvGr -0.13590737
## TotRmsAbvGrd 0.53372316
## Fireplaces 0.46692884
## GarageYrBlt 0.48636168
## GarageCars 0.64040920
## GarageArea 0.62343144
## WoodDeckSF 0.32441344
## OpenPorchSF 0.31585623
## EnclosedPorch -0.12857796
## X3SsnPorch 0.04458367
## ScreenPorch 0.11144657
## PoolArea 0.09240355
## MiscVal -0.02118958
## MoSold 0.04643225
## YrSold -0.02892259
## SalePrice 1.00000000
(subset(corSales, cor > 0.5))
## cor
## OverallQual 0.7909816
## YearBuilt 0.5228973
## YearRemodAdd 0.5071010
## TotalBsmtSF 0.6135806
## X1stFlrSF 0.6058522
## GrLivArea 0.7086245
## FullBath 0.5606638
## TotRmsAbvGrd 0.5337232
## GarageCars 0.6404092
## GarageArea 0.6234314
## SalePrice 1.0000000
model <- lm(SalePrice ~ OverallQual + YearBuilt + YearRemodAdd + TotalBsmtSF + X1stFlrSF + GrLivArea + FullBath + TotRmsAbvGrd + GarageCars + GarageArea, data = housing)
summary(model)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + YearBuilt + YearRemodAdd +
## TotalBsmtSF + X1stFlrSF + GrLivArea + FullBath + TotRmsAbvGrd +
## GarageCars + GarageArea, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -489958 -19316 -1948 16020 290558
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.186e+06 1.291e+05 -9.187 < 2e-16 ***
## OverallQual 1.960e+04 1.190e+03 16.472 < 2e-16 ***
## YearBuilt 2.682e+02 5.035e+01 5.328 1.15e-07 ***
## YearRemodAdd 2.965e+02 6.363e+01 4.659 3.47e-06 ***
## TotalBsmtSF 1.986e+01 4.295e+00 4.625 4.09e-06 ***
## X1stFlrSF 1.417e+01 4.930e+00 2.875 0.004097 **
## GrLivArea 5.130e+01 4.233e+00 12.119 < 2e-16 ***
## FullBath -6.791e+03 2.682e+03 -2.532 0.011457 *
## TotRmsAbvGrd 3.310e+01 1.119e+03 0.030 0.976404
## GarageCars 1.042e+04 3.044e+03 3.422 0.000639 ***
## GarageArea 1.495e+01 1.031e+01 1.450 0.147384
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37920 on 1449 degrees of freedom
## Multiple R-squared: 0.7737, Adjusted R-squared: 0.7721
## F-statistic: 495.4 on 10 and 1449 DF, p-value: < 2.2e-16
We have an \(R^2\) value of 0.7737, so about 77.4% of the variance in SalePrice can be explained by this model.
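As a quick check on the fit (a sketch, not part of the submission), the standard lm() diagnostic plots can be inspected for non-linearity, non-constant variance, and influential points:
# Residuals vs fitted, normal Q-Q, scale-location, and leverage plots
par(mfrow = c(2, 2))
plot(model)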
##Load Test data
test <- read.csv("https://raw.githubusercontent.com/choudhury1023/DATA-605/master/Final%20Exam/test.csv")
##predict SalePrice
mySalePrice <- predict(model,test)
##create dataframe
prediction <- data.frame( Id = test[,"Id"], SalePrice = mySalePrice)
prediction[prediction<0] <- 0
prediction <- replace(prediction,is.na(prediction),0)
head(prediction)
## Id SalePrice
## 1 1461 110135.9
## 2 1462 159060.0
## 3 1463 169683.7
## 4 1464 188059.7
## 5 1465 219782.0
## 6 1466 182152.0
##write .csv for submission
write.csv(prediction, file="prediction.csv", row.names = FALSE)
Kaggle Score
My initial submission to Kaggle (user name: choudhury1023) was unsuccessful because it contained NA values; my second submission was successful, but the score is poor (0.85356, where lower is better). I need to tweak the model more, or perhaps use a different regression model, to improve the score.
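One possible tweak, sketched below under the assumption that near-zero predictions are heavily penalized by the log-based evaluation metric: fall back to the training-set median SalePrice instead of 0 for rows the model cannot score (the output file name is hypothetical).
# Sketch: use the training median as a fallback for missing or negative predictions
fallback <- median(housing$SalePrice)
mySalePrice[is.na(mySalePrice) | mySalePrice < 0] <- fallback
prediction <- data.frame(Id = test$Id, SalePrice = mySalePrice)
write.csv(prediction, file = "prediction_v2.csv", row.names = FALSE)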