library(bitops)
library(RCurl)
library(xgboost)
## Warning: package 'xgboost' was built under R version 3.3.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.3.3
## corrplot 0.84 loaded
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:xgboost':
## 
##     slice
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
url <- getURL("https://raw.githubusercontent.com/excelsiordata/DATA605/master/train.csv")
train <- read.csv(text = url, head = TRUE, sep = ",", stringsAsFactors = FALSE)
head(train)
##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1  1         60       RL          65    8450   Pave  <NA>      Reg
## 2  2         20       RL          80    9600   Pave  <NA>      Reg
## 3  3         60       RL          68   11250   Pave  <NA>      IR1
## 4  4         70       RL          60    9550   Pave  <NA>      IR1
## 5  5         60       RL          84   14260   Pave  <NA>      IR1
## 6  6         50       RL          85   14115   Pave  <NA>      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 2         Lvl    AllPub       FR2       Gtl      Veenker      Feedr
## 3         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 4         Lvl    AllPub    Corner       Gtl      Crawfor       Norm
## 5         Lvl    AllPub       FR2       Gtl      NoRidge       Norm
## 6         Lvl    AllPub    Inside       Gtl      Mitchel       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     2Story           7           5      2003
## 2       Norm     1Fam     1Story           6           8      1976
## 3       Norm     1Fam     2Story           7           5      2001
## 4       Norm     1Fam     2Story           7           5      1915
## 5       Norm     1Fam     2Story           8           5      2000
## 6       Norm     1Fam     1.5Fin           5           5      1993
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         2003     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 2         1976     Gable  CompShg     MetalSd     MetalSd       None
## 3         2002     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 4         1970     Gable  CompShg     Wd Sdng     Wd Shng       None
## 5         2000     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 6         1995     Gable  CompShg     VinylSd     VinylSd       None
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1        196        Gd        TA      PConc       Gd       TA           No
## 2          0        TA        TA     CBlock       Gd       TA           Gd
## 3        162        Gd        TA      PConc       Gd       TA           Mn
## 4          0        TA        TA     BrkTil       TA       Gd           No
## 5        350        Gd        TA      PConc       Gd       TA           Av
## 6          0        TA        TA       Wood       Gd       TA           No
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          GLQ        706          Unf          0       150         856
## 2          ALQ        978          Unf          0       284        1262
## 3          GLQ        486          Unf          0       434         920
## 4          ALQ        216          Unf          0       540         756
## 5          GLQ        655          Unf          0       490        1145
## 6          GLQ        732          Unf          0        64         796
##   Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1    GasA        Ex          Y      SBrkr       856       854            0
## 2    GasA        Ex          Y      SBrkr      1262         0            0
## 3    GasA        Ex          Y      SBrkr       920       866            0
## 4    GasA        Gd          Y      SBrkr       961       756            0
## 5    GasA        Ex          Y      SBrkr      1145      1053            0
## 6    GasA        Ex          Y      SBrkr       796       566            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1      1710            1            0        2        1            3
## 2      1262            0            1        2        0            3
## 3      1786            1            0        2        1            3
## 4      1717            1            0        1        0            3
## 5      2198            1            0        2        1            4
## 6      1362            1            0        1        1            1
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          Gd            8        Typ          0        <NA>
## 2            1          TA            6        Typ          1          TA
## 3            1          Gd            6        Typ          1          TA
## 4            1          Gd            7        Typ          1          Gd
## 5            1          Gd            9        Typ          1          TA
## 6            1          TA            5        Typ          0        <NA>
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Attchd        2003          RFn          2        548         TA
## 2     Attchd        1976          RFn          2        460         TA
## 3     Attchd        2001          RFn          2        608         TA
## 4     Detchd        1998          Unf          3        642         TA
## 5     Attchd        2000          RFn          3        836         TA
## 6     Attchd        1993          Unf          2        480         TA
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1         TA          Y          0          61             0          0
## 2         TA          Y        298           0             0          0
## 3         TA          Y          0          42             0          0
## 4         TA          Y          0          35           272          0
## 5         TA          Y        192          84             0          0
## 6         TA          Y         40          30             0        320
##   ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1           0        0   <NA>  <NA>        <NA>       0      2   2008
## 2           0        0   <NA>  <NA>        <NA>       0      5   2007
## 3           0        0   <NA>  <NA>        <NA>       0      9   2008
## 4           0        0   <NA>  <NA>        <NA>       0      2   2006
## 5           0        0   <NA>  <NA>        <NA>       0     12   2008
## 6           0        0   <NA> MnPrv        Shed     700     10   2009
##   SaleType SaleCondition SalePrice
## 1       WD        Normal    208500
## 2       WD        Normal    181500
## 3       WD        Normal    223500
## 4       WD       Abnorml    140000
## 5       WD        Normal    250000
## 6       WD        Normal    143000
dim(train)
## [1] 1460   81
class(train)
## [1] "data.frame"
X <- train$OverallQual
Y <- train$SalePrice
x <- quantile(X, 0.25)
y <- quantile(Y, 0.5)

nums <- train %>% select_if(is.numeric) %>% select(-SalePrice)

numscorr <- cor(nums, train$SalePrice)
corrplot(numscorr, method = "color")

head(numscorr)
##                    [,1]
## Id          -0.02191672
## MSSubClass  -0.08428414
## LotFrontage          NA
## LotArea      0.26384335
## OverallQual  0.79098160
## OverallCond -0.07785589
y <- getURL("https://raw.githubusercontent.com/excelsiordata/DATA605/master/test.csv")
test <- read.csv(text = y, head = TRUE, sep = ",", stringsAsFactors = FALSE)
head(test)
##     Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1461         20       RH          80   11622   Pave  <NA>      Reg
## 2 1462         20       RL          81   14267   Pave  <NA>      IR1
## 3 1463         60       RL          74   13830   Pave  <NA>      IR1
## 4 1464         60       RL          78    9978   Pave  <NA>      IR1
## 5 1465        120       RL          43    5005   Pave  <NA>      IR1
## 6 1466         60       RL          75   10000   Pave  <NA>      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl        NAmes      Feedr
## 2         Lvl    AllPub    Corner       Gtl        NAmes       Norm
## 3         Lvl    AllPub    Inside       Gtl      Gilbert       Norm
## 4         Lvl    AllPub    Inside       Gtl      Gilbert       Norm
## 5         HLS    AllPub    Inside       Gtl      StoneBr       Norm
## 6         Lvl    AllPub    Corner       Gtl      Gilbert       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     1Story           5           6      1961
## 2       Norm     1Fam     1Story           6           6      1958
## 3       Norm     1Fam     2Story           5           5      1997
## 4       Norm     1Fam     2Story           6           6      1998
## 5       Norm   TwnhsE     1Story           8           5      1992
## 6       Norm     1Fam     2Story           6           5      1993
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         1961     Gable  CompShg     VinylSd     VinylSd       None
## 2         1958       Hip  CompShg     Wd Sdng     Wd Sdng    BrkFace
## 3         1998     Gable  CompShg     VinylSd     VinylSd       None
## 4         1998     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 5         1992     Gable  CompShg     HdBoard     HdBoard       None
## 6         1994     Gable  CompShg     HdBoard     HdBoard       None
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1          0        TA        TA     CBlock       TA       TA           No
## 2        108        TA        TA     CBlock       TA       TA           No
## 3          0        TA        TA      PConc       Gd       TA           No
## 4         20        TA        TA      PConc       TA       TA           No
## 5          0        Gd        TA      PConc       Gd       TA           No
## 6          0        TA        TA      PConc       Gd       TA           No
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          Rec        468          LwQ        144       270         882
## 2          ALQ        923          Unf          0       406        1329
## 3          GLQ        791          Unf          0       137         928
## 4          GLQ        602          Unf          0       324         926
## 5          ALQ        263          Unf          0      1017        1280
## 6          Unf          0          Unf          0       763         763
##   Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1    GasA        TA          Y      SBrkr       896         0            0
## 2    GasA        TA          Y      SBrkr      1329         0            0
## 3    GasA        Gd          Y      SBrkr       928       701            0
## 4    GasA        Ex          Y      SBrkr       926       678            0
## 5    GasA        Ex          Y      SBrkr      1280         0            0
## 6    GasA        Gd          Y      SBrkr       763       892            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1       896            0            0        1        0            2
## 2      1329            0            0        1        1            3
## 3      1629            0            0        2        1            3
## 4      1604            0            0        2        1            3
## 5      1280            0            0        2        0            2
## 6      1655            0            0        2        1            3
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          TA            5        Typ          0        <NA>
## 2            1          Gd            6        Typ          0        <NA>
## 3            1          TA            6        Typ          1          TA
## 4            1          Gd            7        Typ          1          Gd
## 5            1          Gd            5        Typ          0        <NA>
## 6            1          TA            7        Typ          1          TA
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Attchd        1961          Unf          1        730         TA
## 2     Attchd        1958          Unf          1        312         TA
## 3     Attchd        1997          Fin          2        482         TA
## 4     Attchd        1998          Fin          2        470         TA
## 5     Attchd        1992          RFn          2        506         TA
## 6     Attchd        1993          Fin          2        440         TA
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1         TA          Y        140           0             0          0
## 2         TA          Y        393          36             0          0
## 3         TA          Y        212          34             0          0
## 4         TA          Y        360          36             0          0
## 5         TA          Y          0          82             0          0
## 6         TA          Y        157          84             0          0
##   ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1         120        0   <NA> MnPrv        <NA>       0      6   2010
## 2           0        0   <NA>  <NA>        Gar2   12500      6   2010
## 3           0        0   <NA> MnPrv        <NA>       0      3   2010
## 4           0        0   <NA>  <NA>        <NA>       0      6   2010
## 5         144        0   <NA>  <NA>        <NA>       0      1   2010
## 6           0        0   <NA>  <NA>        <NA>       0      4   2010
##   SaleType SaleCondition
## 1       WD        Normal
## 2       WD        Normal
## 3       WD        Normal
## 4       WD        Normal
## 5       WD        Normal
## 6       WD        Normal
dim(test)
## [1] 1459   80
class(test)
## [1] "data.frame"

\[P(X>x | Y>y)\] ##Probability

pXx <- length(x)

#calculate how many rows are above the first quartile
pXx1 <- train %>% filter(OverallQual > quantile(OverallQual, 0.25))
nrow(pXx1)
## [1] 922
pXx <- nrow(pXx1)/length(X)
pXx
## [1] 0.6315068
#calculate how many rows are above the 2nd quartile
pYy1 <- train %>% filter(SalePrice > quantile(SalePrice, 0.5))
nrow(pYy1)
## [1] 728
pYy <- nrow(pYy1)/length(Y)
pYy
## [1] 0.4986301
XxgivenYy <- (sum(Y > y & X > x) / length(X)) / (sum(Y > y) / length(Y))
XxgivenYy
## [1] NaN

The probability that X is greater than x given that Y is greater than y is 0.9313187 or 93.13%.

\[P(X>x , Y>y)\]

XxandYy <- sum(X > x & Y > y) / length(X)
XxandYy
## [1] 0

The probability that X is greater than x and Y is greater than y is 0.4643836 or ~46.43%.

\[P(X<x | Y>y)\]

XlessxgivenYgreaty <- (sum(Y > y & X < x) / length(X)) / (sum(Y > y) / length(Y))
XlessxgivenYgreaty
## [1] NaN

The probability that X is less than x given that Y is greater than y is 0.004120879 or 0.41%.

xy.con.table <- table(X > x, Y > y)
chisq.test(xy.con.table)
## 
##  Chi-squared test for given probabilities
## 
## data:  xy.con.table
## X-squared = 101, df = 1, p-value < 2.2e-16

\(P(A|B) \neq P(A)P(B)\). This indicates that our variables, Overall Quality and Sales Price are not independent.

The results from our Chi Squared test confirm there is a relationship between Overall Quality and Sales Price.

Descriptive and Inferential Statistics

library(psych)
## Warning: package 'psych' was built under R version 3.3.3
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
plot(train$SalePrice ~ train$OverallQual)

boxplot(train$SalePrice ~ train$OverallQual)

by(train$SalePrice, train$OverallQual, mean)
## train$OverallQual: 1
## [1] 50150
## -------------------------------------------------------- 
## train$OverallQual: 2
## [1] 51770.33
## -------------------------------------------------------- 
## train$OverallQual: 3
## [1] 87473.75
## -------------------------------------------------------- 
## train$OverallQual: 4
## [1] 108420.7
## -------------------------------------------------------- 
## train$OverallQual: 5
## [1] 133523.3
## -------------------------------------------------------- 
## train$OverallQual: 6
## [1] 161603
## -------------------------------------------------------- 
## train$OverallQual: 7
## [1] 207716.4
## -------------------------------------------------------- 
## train$OverallQual: 8
## [1] 274735.5
## -------------------------------------------------------- 
## train$OverallQual: 9
## [1] 367513
## -------------------------------------------------------- 
## train$OverallQual: 10
## [1] 438588.4
by(train$SalePrice, train$OverallQual, length)
## train$OverallQual: 1
## [1] 2
## -------------------------------------------------------- 
## train$OverallQual: 2
## [1] 3
## -------------------------------------------------------- 
## train$OverallQual: 3
## [1] 20
## -------------------------------------------------------- 
## train$OverallQual: 4
## [1] 116
## -------------------------------------------------------- 
## train$OverallQual: 5
## [1] 397
## -------------------------------------------------------- 
## train$OverallQual: 6
## [1] 374
## -------------------------------------------------------- 
## train$OverallQual: 7
## [1] 319
## -------------------------------------------------------- 
## train$OverallQual: 8
## [1] 168
## -------------------------------------------------------- 
## train$OverallQual: 9
## [1] 43
## -------------------------------------------------------- 
## train$OverallQual: 10
## [1] 18
hist(train$SalePrice, xlab = "Sale Price", main = "Histogram of Sale Price")

plot(train$OverallQual)

describe(train$OverallQual)
##    vars    n mean   sd median trimmed  mad min max range skew kurtosis
## X1    1 1460  6.1 1.38      6    6.08 1.48   1  10     9 0.22     0.09
##      se
## X1 0.04
describe(train$SalePrice)
##    vars    n     mean      sd median  trimmed     mad   min    max  range
## X1    1 1460 180921.2 79442.5 163000 170783.3 56338.8 34900 755000 720100
##    skew kurtosis      se
## X1 1.88      6.5 2079.11
bc <- boxcox(train$SalePrice ~ train$OverallQual)

plot(bc)

Linear Algebra and Correlation

numscorr <- cor(nums)
allcorr <- corrplot(numscorr, type = "full", method = "circle", sig.level = 0.01, insig = "blank")

numscorr3 <- nums %>% 
  dplyr::select(TotalBsmtSF,X1stFlrSF,Fireplaces) %>% 
  cor()

invnumscorr3<-solve(numscorr3)

numscorr3 %*% invnumscorr3
##              TotalBsmtSF     X1stFlrSF    Fireplaces
## TotalBsmtSF 1.000000e+00 -5.551115e-17  0.000000e+00
## X1stFlrSF   4.943962e-17  1.000000e+00 -5.551115e-17
## Fireplaces  2.619432e-16 -1.110223e-16  1.000000e+00
invnumscorr3 %*% numscorr3
##               TotalBsmtSF     X1stFlrSF    Fireplaces
## TotalBsmtSF  1.000000e+00  5.551115e-17  2.775558e-16
## X1stFlrSF   -5.551115e-17  1.000000e+00 -1.110223e-16
## Fireplaces   0.000000e+00 -5.551115e-17  1.000000e+00

Calculus-Based Probability & Statistics

fitdistr(train$OverallQual, densfun = "log-normal")
##      meanlog        sdlog   
##   1.780784542   0.241275908 
##  (0.006314479) (0.004465011)
hist(train$OverallQual, main = "Full Overall Quality Population", xlab = "Overall Quality Rating")

OQ1000 <- sample(train$OverallQual, 1000)
fitdistr(OQ1000, densfun = "log-normal")
##      meanlog        sdlog   
##   1.785673403   0.232424204 
##  (0.007349899) (0.005197163)
hist(OQ1000, main = "Subset of the Overall Quality Population", xlab = "Overall Quality Rating")

Modeling

library(randomForest)
## Warning: package 'randomForest' was built under R version 3.3.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ModelMetrics)
## Warning: package 'ModelMetrics' was built under R version 3.3.3
train <- read.csv(text = url, head = TRUE, sep = ",", stringsAsFactors = FALSE)

train1 <- train %>% select_if(is.numeric) %>% dplyr::select(-Id)
train1[is.na(train1)] <- -1
fit <- randomForest(SalePrice ~ .,data=train1, 
                    ntree=800, verbose=T)
summary(fit)
##                 Length Class  Mode     
## call               5   -none- call     
## type               1   -none- character
## predicted       1460   -none- numeric  
## mse              800   -none- numeric  
## rsq              800   -none- numeric  
## oob.times       1460   -none- numeric  
## importance        36   -none- numeric  
## importanceSD       0   -none- NULL     
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            11   -none- list     
## coefs              0   -none- NULL     
## y               1460   -none- numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call
prd <- predict(fit,train1)

test1 <- test

test1[is.na(test1)] <- -1

testprd <- (predict(fit,test1))

testprd.df <- data.frame(testprd)

testprd.df$Id <- 1461:2919
names(testprd.df)[names(testprd.df)=="testprd"] <- "SalePrice"
testprd.df <- testprd.df[c(2, 1)]

head(testprd.df)
##     Id SalePrice
## 1 1461  127633.5
## 2 1462  154264.9
## 3 1463  182640.7
## 4 1464  184318.0
## 5 1465  192584.3
## 6 1466  185055.2
rmse(train1$SalePrice,prd)
## [1] 12181.35
rmse(X,testprd)
## [1] 192959.7
varImpPlot(fit)

write.csv(testprd.df, file = "KLS_Results.csv")