** DATA_605 Final Exam - Computational Mathematics - Spring 2018 **

http://rpubs.com/danthonn/390789

if (!require(stats)) install.packages("stats")
library(stats)

if (!require(pracma)) install.packages("pracma")
## Loading required package: pracma
## Warning: package 'pracma' was built under R version 3.3.3
library(pracma)

if (!require(Deriv)) install.packages("Deriv")
## Loading required package: Deriv
## Warning: package 'Deriv' was built under R version 3.3.3
library(Deriv)

if (!require(knitr)) install.packages("knitr")
## Loading required package: knitr
library(knitr)

if (!require(DT)) install.packages("DT")
## Loading required package: DT
library(DT)

if (!require(MASS)) install.packages("MASS")
## Loading required package: MASS
library(MASS)

if (!require(reshape)) install.packages("reshape")
## Loading required package: reshape
## Warning: package 'reshape' was built under R version 3.3.3
library(reshape)

if (!require(ggplot2)) install.packages("ggplot2")
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.3
library(ggplot2)

if (!require(dplyr)) install.packages("dplyr")
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:reshape':
## 
##     rename
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(dplyr)

if (!require(tidyr)) install.packages("tidyr")
## Loading required package: tidyr
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:reshape':
## 
##     expand, smiths
library(tidyr)

if (!require(pastecs)) install.packages("pastecs")
## Loading required package: pastecs
## Warning: package 'pastecs' was built under R version 3.3.3
## 
## Attaching package: 'pastecs'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(pastecs)

if (!require(psych)) install.packages("psych")
## Loading required package: psych
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following objects are masked from 'package:pracma':
## 
##     logit, polar
library(psych)

if (!require(purrr)) install.packages("purrr")
## Loading required package: purrr
## Warning: package 'purrr' was built under R version 3.3.3
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:pracma':
## 
##     cross
library(purrr)

if (!require(corrplot)) install.packages("corrplot")
## Loading required package: corrplot
## Warning: package 'corrplot' was built under R version 3.3.3
## corrplot 0.84 loaded
library(corrplot)

if (!require(Matrix)) install.packages("Matrix")
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:reshape':
## 
##     expand
## The following objects are masked from 'package:pracma':
## 
##     expm, lu, tril, triu
library(Matrix)

if (!require(Rmisc)) install.packages("Rmisc")
## Loading required package: Rmisc
## Warning: package 'Rmisc' was built under R version 3.3.3
## Loading required package: lattice
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
## 
##     compact
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:reshape':
## 
##     rename, round_any
library(Rmisc)

Final Exam:

You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following.

** Data **

url <- 'https://raw.githubusercontent.com/danthonn/DATA_605_Final_DThonn/master/train.csv'


train1 <- read.csv(url, header = TRUE, stringsAsFactors = FALSE)
#head(train1)
glimpse(train1)
## Observations: 1,460
## Variables: 81
## $ Id            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
## $ MSSubClass    <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60,...
## $ MSZoning      <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", ...
## $ LotFrontage   <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, ...
## $ LotArea       <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10...
## $ Street        <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", ...
## $ Alley         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ LotShape      <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg",...
## $ LandContour   <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl",...
## $ Utilities     <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub"...
## $ LotConfig     <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Ins...
## $ LandSlope     <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl",...
## $ Neighborhood  <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoR...
## $ Condition1    <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm",...
## $ Condition2    <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", ...
## $ BldgType      <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", ...
## $ HouseStyle    <chr> "2Story", "1Story", "2Story", "2Story", "2Story"...
## $ OverallQual   <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, ...
## $ OverallCond   <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, ...
## $ YearBuilt     <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, ...
## $ YearRemodAdd  <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, ...
## $ RoofStyle     <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Ga...
## $ RoofMatl      <chr> "CompShg", "CompShg", "CompShg", "CompShg", "Com...
## $ Exterior1st   <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "Vin...
## $ Exterior2nd   <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "Vin...
## $ MasVnrType    <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace",...
## $ MasVnrArea    <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, ...
## $ ExterQual     <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", ...
## $ ExterCond     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", ...
## $ Foundation    <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "...
## $ BsmtQual      <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", ...
## $ BsmtCond      <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", ...
## $ BsmtExposure  <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", ...
## $ BsmtFinType1  <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ",...
## $ BsmtFinSF1    <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851,...
## $ BsmtFinType2  <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf",...
## $ BsmtFinSF2    <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ BsmtUnfSF     <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140,...
## $ TotalBsmtSF   <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952,...
## $ Heating       <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", ...
## $ HeatingQC     <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", ...
## $ CentralAir    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"...
## $ Electrical    <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SB...
## $ X1stFlrSF     <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022...
## $ X2ndFlrSF     <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, ...
## $ LowQualFinSF  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ GrLivArea     <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, ...
## $ BsmtFullBath  <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, ...
## $ BsmtHalfBath  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FullBath      <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, ...
## $ HalfBath      <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ BedroomAbvGr  <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, ...
## $ KitchenAbvGr  <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, ...
## $ KitchenQual   <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", ...
## $ TotRmsAbvGrd  <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5,...
## $ Functional    <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ",...
## $ Fireplaces    <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, ...
## $ FireplaceQu   <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA"...
## $ GarageType    <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd"...
## $ GarageYrBlt   <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, ...
## $ GarageFinish  <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn",...
## $ GarageCars    <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, ...
## $ GarageArea    <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205...
## $ GarageQual    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", ...
## $ GarageCond    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", ...
## $ PavedDrive    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"...
## $ WoodDeckSF    <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, ...
## $ OpenPorchSF   <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, ...
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, ...
## $ X3SsnPorch    <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ ScreenPorch   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0...
## $ PoolArea      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ PoolQC        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Fence         <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA,...
## $ MiscFeature   <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, ...
## $ MiscVal       <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0,...
## $ MoSold        <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, ...
## $ YrSold        <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, ...
## $ SaleType      <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", ...
## $ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal...
## $ SalePrice     <int> 208500, 181500, 223500, 140000, 250000, 143000, ...
# independent variable
X <- train1$GrLivArea
head(X)
## [1] 1710 1262 1786 1717 2198 1362
# dependent variable
Y <- train1$SalePrice
head(Y)
## [1] 208500 181500 223500 140000 250000 143000
plot(X,Y, col="#4caf50", main="Scatterplot GrLivArea and Sale Price", xlab = "Gr Liv Area", ylab="Sale Price")
abline(lm(Y~X), col="red", lwd=3) # regression line (y~x) 

# Histograms and summaries of X (GrLivArea) and Y (Sales Price)

# GrLivArea - X
hist(X, col="yellow", main="Histogram - GrLivArea", xlab = "GrLivArea")

# Summary of x - GrLivArea
summary(X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1130    1464    1515    1777    5642
# Sale Price
hist(Y, col="blue", main="Histogram - Sale Price", xlab = "Sale Price")

# Summary of y - Sale Price
summary(Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  130000  163000  180900  214000  755000

** Probability **

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.

  a. P(X>x | Y>y)   b. P(X>x, Y>y)   c. P(X<x | Y>y)

\[a. P(X > x | Y > y)\]

# 1st quartile of X variable
x<-quantile(X, probs=0.25) 
x2<-quantile(X, probs=0.50)
x
##    25% 
## 1129.5

#1st quartile of Y variable
y<-quantile(Y, probs=0.25)
y3<-quantile(Y, probs=0.75)
y
##    25% 
## 129975

# number of rows of train1 data set (train.csv)
num_len<-(nrow(train1))
num_len
## [1] 1460
# data for train1$GrLivArea
gr_liv_area<-as.numeric(train1$GrLivArea)
# data for train1$SalePrice
sale_price<-as.numeric(train1$SalePrice)

# rows where sale_price > 1st quartile of Y
n_Yqt1<-nrow(subset(train1,sale_price>y))
n_Yqt1
## [1] 1095
# the probability that P(X>x|Y>y)
pa<-nrow(subset(train1, gr_liv_area > x & sale_price> y))/n_Yqt1
pa
## [1] 0.8712329
#Answer-a:
# [1] 0.8712329

\[b. P(X > x , Y > y)\]

pb<-nrow(subset(train1, gr_liv_area > x & sale_price> y))/num_len
pb
## [1] 0.6534247
# Answer-b:
# [1] 0.6534247

\[c. P(X < x | Y > y)\]

pc <-nrow(subset(train1, gr_liv_area < x & sale_price> y))/n_Yqt1
pc
## [1] 0.1287671
#Answer-c:
# [1] 0.1287671
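
Since x = 1129.5 is not an attained value of GrLivArea, the events X > x and X < x partition the observations; conditioned on Y > y, the two probabilities should therefore sum to 1, which gives a quick sanity check:

# Sanity check: P(X>x | Y>y) + P(X<x | Y>y) should equal 1
pa + pc
# expected: 1 (0.8712329 + 0.1287671)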

** Probability - Check Independence **

Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 1st quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does P(AB)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.

# Check independence: let A = {X > x} and B = {Y > y}
subof_x <- subset(train1, !is.na(gr_liv_area) & gr_liv_area > x)
subof_y <- subset(train1, !is.na(sale_price) & sale_price > y)
subof_xy <- subset(train1, !is.na(gr_liv_area) & !is.na(sale_price) & gr_liv_area > x & sale_price > y)

pa <- nrow(subof_x)/nrow(train1)
pb <- nrow(subof_y)/nrow(train1)
pab <- nrow(subof_xy)/nrow(train1)

pab == pa*pb
## [1] FALSE
# Conclusion: splitting the data in this fashion does not make them independent.
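
An exact equality test on floating-point probabilities is fragile; comparing the two quantities directly (a quick sketch using the values computed above) makes the gap explicit:

# P(AB) versus P(A)P(B); the material difference confirms dependence
round(c(P_AB = pab, PA_times_PB = pa * pb), 4)
# expected: P_AB about 0.6534, PA_times_PB about 0.5625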

** Probability - Table of Counts **

# Proportions (out of all 1,460 observations), split on the 2nd quartile (median)
# of GrLivArea and the 3rd quartile of SalePrice
col1<-nrow(subset(train1, gr_liv_area <=x2 & sale_price<=y3))/num_len
col2<-nrow(subset(train1, gr_liv_area <=x2 & sale_price>y3))/num_len
col3<-nrow(subset(train1, gr_liv_area >x2 & sale_price<=y3))/num_len
col4<-nrow(subset(train1, gr_liv_area >x2 & sale_price>y3))/num_len

counts<-matrix(round(c(col1,col2,col3,col4),3), ncol=2, nrow=2, byrow=TRUE)
colnames(counts)<-c("SalePrice <=3rd quartile", "SalePrice >3rd quartile")
rownames(counts)<-c("GrLivArea <=2nd quartile", "GrLivArea >2nd quartile")

print("Table of Counts")
## [1] "Table of Counts"

counts<-as.table(counts)
addmargins(counts)
##                          SalePrice <=3rd quartile SalePrice >3rd quartile   Sum
## GrLivArea <=2nd quartile                    0.491                   0.010 0.501
## GrLivArea >2nd quartile                     0.261                   0.238 0.499
## Sum                                         0.752                   0.248 1.000

** Probability - Chi Square Test **

# Chi Square Test
(chi_table <- table(X > x, Y > y))
##        
##         FALSE TRUE
##   FALSE   224  141
##   TRUE    141  954
prop.table(chi_table)
##        
##              FALSE       TRUE
##   FALSE 0.15342466 0.09657534
##   TRUE  0.09657534 0.65342466
(chi_results <- chisq.test(chi_table))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  chi_table
## X-squared = 340.75, df = 1, p-value < 2.2e-16

# As the p-value (< 2.2e-16) is less than the .05 significance level, the null
# hypothesis is rejected and the conclusion is that SalePrice and GrLivArea are dependent.
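
A rough effect-size measure for this association is the phi coefficient; the sketch below derives it from the Yates-corrected statistic above, so treat the value as approximate:

# Approximate phi coefficient from the chi-square statistic
sqrt(unname(chi_results$statistic) / sum(chi_table))
# expected: about 0.48, a moderate-to-strong association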

** Statistics - Descriptive and Inferential - Plots **

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y.

XY <- cbind(X, Y)

# Descriptive Statistics
options(scipen = 999)
kable(t(round(describe(XY, quant = c(0.25, 0.75)), 2)))
|          |       X |         Y |
|:---------|--------:|----------:|
| vars     |    1.00 |      2.00 |
| n        | 1460.00 |   1460.00 |
| mean     | 1515.46 | 180921.20 |
| sd       |  525.48 |  79442.50 |
| median   | 1464.00 | 163000.00 |
| trimmed  | 1467.67 | 170783.29 |
| mad      |  483.33 |  56338.80 |
| min      |  334.00 |  34900.00 |
| max      | 5642.00 | 755000.00 |
| range    | 5308.00 | 720100.00 |
| skew     |    1.36 |      1.88 |
| kurtosis |    4.86 |      6.50 |
| se       |   13.75 |   2079.11 |
| Q0.25    | 1129.50 | 129975.00 |
| Q0.75    | 1776.75 | 214000.00 |
# Prepare data frame for plotting 
XY_df1 <- data.frame(XY)

# Provide a scatterplot of X and Y 
z <- ggplot(XY_df1, aes(X, Y))
z + geom_point()

# X 
hist(XY_df1$X, freq = FALSE, breaks = 25, main = "Variable X")
xfx <- seq(min(XY_df1$X), max(XY_df1$X), by = 1)
yfx <- dnorm(xfx, mean(XY_df1$X), sd(XY_df1$X))
lines(xfx, yfx)

# Y 
hist(XY_df1$Y, freq = FALSE, breaks = 30, main = "Variable Y")
xfy <- seq(min(XY_df1$Y), max(XY_df1$Y), by = 10)
yfy <- dnorm(xfy, mean(XY_df1$Y), sd(XY_df1$Y))
lines(xfy, yfy)

XYmod1 <- lm(Y ~ X, data = XY_df1)

# plot the XYmod1
plot(XYmod1, which = 2)

# plot the residuals
sresid <- studres(XYmod1)
hist(sresid, freq = FALSE, breaks = 25, main = "Distribution of Residuals")
xfs <- seq(min(sresid), max(sresid), length = 50)
yfs <- dnorm(xfs)
lines(xfs, yfs)

** Statistics - Descriptive and Inferential - Correlation Matrices **

Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

# Check Correlations 

#head(train1)
colnames(train1)
##  [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
##  [5] "LotArea"       "Street"        "Alley"         "LotShape"     
##  [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
## [13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
## [21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
## [29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
## [33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
## [37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
## [41] "HeatingQC"     "CentralAir"    "Electrical"    "X1stFlrSF"    
## [45] "X2ndFlrSF"     "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
## [49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
## [53] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
## [57] "Fireplaces"    "FireplaceQu"   "GarageType"    "GarageYrBlt"  
## [61] "GarageFinish"  "GarageCars"    "GarageArea"    "GarageQual"   
## [65] "GarageCond"    "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"  
## [69] "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"   "PoolArea"     
## [73] "PoolQC"        "Fence"         "MiscFeature"   "MiscVal"      
## [77] "MoSold"        "YrSold"        "SaleType"      "SaleCondition"
## [81] "SalePrice"
# Select columns of housing data to check for correlation
corr_set <- select(train1, LotArea, GrLivArea, YearBuilt, SalePrice)

corr_mat <- cor(corr_set)
corrplot(corr_mat, method="square")

# The strongest correlation with SalePrice is GrLivArea (about 0.709)

# Welch two-sample t-test at a 92% confidence level (note: this compares the
# means of the two variables, not their correlation)
t.test(train1$GrLivArea, train1$SalePrice,conf.level = 0.92)
## 
##  Welch Two Sample t-test
## 
## data:  train1$GrLivArea and train1$SalePrice
## t = -86.288, df = 1459.1, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 92 percent confidence interval:
##  -183048.2 -175763.3
## sample estimates:
##  mean of x  mean of y 
##   1515.464 180921.196

# Conclusion:
# The p-value (< 0.05) means we can reject the null hypothesis of equal means. Since
# the two variables are on different scales, this result is unsurprising; the
# correlation test below addresses the relationship between them directly.


# Run the Pearson correlation test to determine whether the correlation is 0, with a 92% CI
cor.test(train1$GrLivArea, train1$SalePrice, method = "pearson" , conf.level = 0.92)
## 
##  Pearson's product-moment correlation
## 
## data:  train1$GrLivArea and train1$SalePrice
## t = 38.348, df = 1458, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.6850407 0.7307245
## sample estimates:
##       cor 
## 0.7086245

# Conclusion:
# This test indicates a significant correlation, as the p-value is < 0.05. The
# estimated correlation of 0.708 indicates a strong positive linear relationship.

# Family-wise error:
# The family-wise error rate (FWER) is the probability of making one or more false
# discoveries (Type I errors) when performing multiple hypothesis tests.
# Conclusion: I don't believe family-wise error is a major concern for this analysis.
# With only a few tests, and p-values many orders of magnitude below the
# significance level, the family-wise error risk is low.
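
The prompt asks for the test on each pairwise set of the chosen quantitative variables; a sketch that loops cor.test over every pair of the predictors in corr_set at the 92% level:

# Pairwise correlation tests among the three predictors (92% confidence level)
pairs_idx <- combn(c("LotArea", "GrLivArea", "YearBuilt"), 2)
apply(pairs_idx, 2, function(p) {
  ct <- cor.test(train1[[p[1]]], train1[[p[2]]], conf.level = 0.92)
  c(pair = paste(p, collapse = " vs "),
    cor = round(unname(ct$estimate), 3),
    p.value = format.pval(ct$p.value))
})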

** Linear Algebra and Correlation **

Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

# Show the 4 x 4 correlation matrix from the correlation section above
# (the three chosen predictors plus SalePrice)
head(corr_mat)
##              LotArea GrLivArea  YearBuilt SalePrice
## LotArea   1.00000000 0.2631162 0.01422765 0.2638434
## GrLivArea 0.26311617 1.0000000 0.19900971 0.7086245
## YearBuilt 0.01422765 0.1990097 1.00000000 0.5228973
## SalePrice 0.26384335 0.7086245 0.52289733 1.0000000
# Solve the correlation matrix to get the precision matrix
corr_pre<-solve(corr_mat)
head(corr_pre)
##              LotArea  GrLivArea  YearBuilt  SalePrice
## LotArea    1.1055865 -0.1134680  0.1614928 -0.2957396
## GrLivArea -0.1134680  2.1981184  0.4996008 -1.7889427
## YearBuilt  0.1614928  0.4996008  1.5217601 -1.1923624
## SalePrice -0.2957396 -1.7889427 -1.1923624  2.9692006
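
As a check on the claim that the precision matrix carries variance inflation factors on its diagonal: the diagonal entry for a variable should equal 1/(1 - R^2) from regressing that (standardized) variable on the others. A sketch using corr_set:

# VIF identity check: diagonal of the precision matrix = 1 / (1 - R^2)
r2 <- summary(lm(GrLivArea ~ ., data = as.data.frame(scale(corr_set))))$r.squared
1 / (1 - r2)
# expected: about 2.198, matching corr_pre["GrLivArea", "GrLivArea"]
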
# Multiply the correlation matrix by the precision matrix
corr_mat %*% corr_pre
##                              LotArea GrLivArea                 YearBuilt
## LotArea    0.99999999999999988897770         0  0.0000000000000000000000
## GrLivArea  0.00000000000000000000000         1 -0.0000000000000002220446
## YearBuilt  0.00000000000000002775558         0  1.0000000000000000000000
## SalePrice -0.00000000000000005551115         0 -0.0000000000000002220446
##                           SalePrice
## LotArea    0.0000000000000000000000
## GrLivArea  0.0000000000000000000000
## YearBuilt -0.0000000000000002220446
## SalePrice  1.0000000000000002220446
# Multiply the precision matrix by the correlation matrix
corr_pre %*% corr_mat
##                              LotArea                  GrLivArea
## LotArea    0.99999999999999988897770  0.00000000000000008326673
## GrLivArea  0.00000000000000000000000  0.99999999999999977795540
## YearBuilt -0.00000000000000005551115 -0.00000000000000022204460
## SalePrice -0.00000000000000011102230  0.00000000000000000000000
##                           YearBuilt                  SalePrice
## LotArea    0.0000000000000001387779  0.00000000000000005551115
## GrLivArea  0.0000000000000000000000  0.00000000000000000000000
## YearBuilt  1.0000000000000000000000 -0.00000000000000022204460
## SalePrice -0.0000000000000002220446  1.00000000000000022204460
# Conduct LU decomposition on the precision matrix
lu_decomp_pre <- lu(corr_pre)
ex_lu_pre <- expand(lu_decomp_pre)

# Lower decomposition - precision
ex_lu_pre$L
## 4 x 4 Matrix of class "dtrMatrix" (unitriangular)
##      [,1]       [,2]       [,3]       [,4]      
## [1,]  1.0000000          .          .          .
## [2,] -0.1026315  1.0000000          .          .
## [3,]  0.1460698  0.2360766  1.0000000          .
## [4,] -0.2674957 -0.8320683 -0.5228973  1.0000000
#Upper decomposition - precision
ex_lu_pre$U
## 4 x 4 Matrix of class "dtrMatrix"
##      [,1]       [,2]       [,3]       [,4]      
## [1,]  1.1055865 -0.1134680  0.1614928 -0.2957396
## [2,]          .  2.1864730  0.5161751 -1.8192949
## [3,]          .          .  1.3763140 -0.7196709
## [4,]          .          .          .  1.0000000
# Conduct LU decomposition on the correlation matrix
lu_corr <- lu(corr_mat)
ex_lu_corr <- expand(lu_corr)

# Lower decomposition - correlation
ex_lu_corr$L
## 4 x 4 Matrix of class "dtrMatrix" (unitriangular)
##      [,1]       [,2]       [,3]       [,4]      
## [1,] 1.00000000          .          .          .
## [2,] 0.26311617 1.00000000          .          .
## [3,] 0.01422765 0.20978997 1.00000000          .
## [4,] 0.26384335 0.68674657 0.40157692 1.00000000
#Upper decomposition - correlation
ex_lu_corr$U
## 4 x 4 Matrix of class "dtrMatrix"
##      [,1]       [,2]       [,3]       [,4]      
## [1,] 1.00000000 0.26311617 0.01422765 0.26384335
## [2,]          . 0.93076988 0.19526619 0.63920303
## [3,]          .          . 0.95883269 0.38504508
## [4,]          .          .          . 0.33679098
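
As a sanity check (a sketch, assuming the Matrix package's expand() convention A = P L U), the expanded factors should reproduce the original correlation matrix up to floating-point error:

# Reconstruction check: P %*% L %*% U should recover corr_mat
max(abs(as.matrix(ex_lu_corr$P %*% ex_lu_corr$L %*% ex_lu_corr$U) - corr_mat))
# expected: on the order of 1e-16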

** Calculus-Based Probability & Statistics **

Many times, it makes sense to fit a closed form distribution to data. For the first variable that you selected which is skewed to the right, shift it so that the minimum value is above zero as necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html). Find the optimal value of lambda for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, lambda)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

# Note: X <- train1$GrLivArea

# Check that min val is not 0
min(train1$GrLivArea)
## [1] 334


# Check the fit distribution
set.seed(4)
X_fit1 <- fitdistr(X, "exponential")

lambda <- X_fit1$estimate
lambda
##        rate 
## 0.000659864
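
For an exponential distribution, the maximum-likelihood estimate of the rate is simply 1/mean(X), which gives a quick cross-check on the fitdistr result:

# MLE cross-check: the fitted rate should equal 1 / mean(X)
1 / mean(X)
# expected: 0.000659864, matching lambda above
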
# Histogram of the original variable
hist(X, breaks = 30)

# Check and compare with exponential distribution
rand_sample1 <- rexp(1000, X_fit1$estimate[[1]])
hist(rand_sample1, breaks = 30)

# Conclusion:
# The original variable has a right-skewed distribution.
# The simulated exponential distribution follows a similar shape after the peak,
# though its peak is higher and sits at the left edge of the range.
# Analyze the Cumulative Distribution Functions:

# Calculate the 5th and 95th percentiles for the model by inverting the
# exponential CDF: F(t) = 1 - exp(-lambda*t), so t = -log(1 - p)/lambda
CDF5 <- log(1 - .05)/-lambda
CDF5
##     rate 
## 77.73313

CDF95 <- log(1 - .95)/-lambda
CDF95
##     rate 
## 4539.924
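
The same percentiles are available directly from R's built-in exponential quantile function, which should agree with the hand-computed values:

# Cross-check against the built-in quantile function
qexp(c(0.05, 0.95), rate = lambda)
# expected: about 77.7 and 4539.9, matching CDF5 and CDF95 above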

# Check the quantiles 5% and 95% of data
quantile(train1$GrLivArea, 0.05)
##  5% 
## 848
quantile(train1$GrLivArea, 0.95)
##    95% 
## 2466.1

# Calculate Confidence Intervals of data
CI(train1$GrLivArea, 0.95)
##    upper     mean    lower 
## 1542.440 1515.464 1488.487
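
The Rmisc CI() interval above can be reproduced by hand under the normality assumption, using mean +/- z * standard error:

# Normal-theory 95% CI by hand
m <- mean(train1$GrLivArea)
se <- sd(train1$GrLivArea) / sqrt(length(train1$GrLivArea))
m + c(-1, 1) * qnorm(0.975) * se
# expected: about 1488.5 to 1542.4, matching CI() above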

# Conclusion:
# CDF-Model (.05, .95) = 77.7 to 4539.9
# Quantiles-Data (.05, .95) = 848 to 2466.1
# CI-Data mean = 1515
# The model ranges from 77.7 to 4539.9 for (.05, .95), while the data range from
# 848 to 2466.1. The exponential model is therefore not a good fit: it implies a
# much wider spread (with far more mass near zero) than the data actually show.

** Modeling **

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

# Create a model with 4 variables (OverallQual, GrLivArea, LotArea, YearBuilt) versus SalePrice.
# Using bare column names with data = train1 (rather than train1$ prefixes) so that
# predict() can later be applied to new data.
model1 <- lm(SalePrice ~ OverallQual + GrLivArea + LotArea + YearBuilt, data = train1)
summary(model1)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + LotArea + 
##     YearBuilt, data = train1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -422508  -21372   -2462   17591  293526 
## 
## Coefficients:
##                  Estimate  Std. Error t value            Pr(>|t|)    
## (Intercept) -1059908.4170  81812.9728 -12.955 <0.0000000000000002 ***
## OverallQual    25687.0280   1145.9429  22.416 <0.0000000000000002 ***
## GrLivArea         56.7259      2.5991  21.825 <0.0000000000000002 ***
## LotArea            0.9160      0.1084   8.447 <0.0000000000000002 ***
## YearBuilt        501.4827     43.0583  11.647 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39800 on 1455 degrees of freedom
## Multiple R-squared:  0.7497, Adjusted R-squared:  0.749 
## F-statistic:  1089 on 4 and 1455 DF,  p-value: < 0.00000000000000022
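
A quick in-sample check (a sketch): the root mean squared error of the fitted values should be close to the residual standard error reported above:

# In-sample RMSE of model1
sqrt(mean(residuals(model1)^2))
# expected: roughly 39,700 (near the residual standard error of 39,800)
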
par(mfrow=c(2,2))
a1<-train1$OverallQual
a2<-train1$GrLivArea
a3<-train1$LotArea
a4<-train1$YearBuilt
b<-train1$SalePrice

# Plot the input variables
plot(a1,b, col="chocolate", main="OverallQual", ylab="Sale Price")
abline(lm(b~a1), col="green", lwd=3) # regression line (y~x)

plot(a2,b, col="blue", main="GrLivArea", ylab="Sale Price")
abline(lm(b~a2), col="green", lwd=3) # regression line (y~x)

plot(a3,b, col="black", main="LotArea", ylab="Sale Price")
abline(lm(b~a3), col="green", lwd=3) # regression line (y~x)

plot(a4,b, col="darkmagenta", main="YearBuilt", ylab="Sale Price")
abline(lm(b~a4), col="green", lwd=3) # regression line (y~x)

# Load test data

test1 <- read.csv('https://raw.githubusercontent.com/danthonn/DATA_605_Final_DThonn/master/test.csv')

# Model equation with rounded coefficients (shown for reference; the actual
# submission uses predict() below)
SalePrice<-((25687.0280*test1$OverallQual) + (56.7259*test1$GrLivArea) + (0.9160*test1$LotArea) + (501.4827*test1$YearBuilt) - 1059908.4170)

# Cleanup test data
test2 <- select_if(test1, is.numeric)
test2[is.na(test2)] <- 0

# Run the prediction on the cleaned test data (with the formula-based model above,
# predict() now correctly uses the 1,459 test rows)
predict1<-predict(model1, newdata = test2)
length(predict1)
## [1] 1459
test3<-test2[,c("Id","OverallQual","GrLivArea","LotArea","YearBuilt")]
head(test3)
##     Id OverallQual GrLivArea LotArea YearBuilt
## 1 1461           5       896   11622      1961
## 2 1462           6      1329   14267      1958
## 3 1463           5      1629   13830      1997
## 4 1464           6      1604    9978      1998
## 5 1465           8      1280    5005      1992
## 6 1466           6      1655   10000      1993
# Preparation for Kaggle submission
submission_dt1 <- data.frame(Id = test3$Id, SalePrice = predict1)
#head(submission_dt1)
#str(submission_dt1)

write.csv(submission_dt1, file = 'C:/Users/Dan/Documents/GitHub/DATA_605_Final_DThonn/submission_dt2.csv', quote=FALSE, row.names=FALSE)

#knitr::include_graphics('https://raw.githubusercontent.com/danthonn/DATA_605_Final_DThonn/master/Kaggle_Score_submission_dt2_Dthonn.PNG')


# Analysis and Conclusion:
# The four attributes with the strongest correlations to SalePrice were selected.
# A multiple linear regression model on these attributes was used for the prediction.
# Kaggle.com user name: dtseattle
# Kaggle.com user score: 0.74802
# The Pr(>|t|) values of all attributes are significant, as they are less than .05.
# The adjusted R-squared of .749 indicates the model explains about 75% of the
# variance in SalePrice.

** END **