library(bitops)
library(RCurl)
library(xgboost)
## Warning: package 'xgboost' was built under R version 3.3.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.3.3
## corrplot 0.84 loaded
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:xgboost':
##
## slice
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
url <- getURL("https://raw.githubusercontent.com/excelsiordata/DATA605/master/train.csv")
train <- read.csv(text = url, head = TRUE, sep = ",", stringsAsFactors = FALSE)
head(train)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1 60 RL 65 8450 Pave <NA> Reg
## 2 2 20 RL 80 9600 Pave <NA> Reg
## 3 3 60 RL 68 11250 Pave <NA> IR1
## 4 4 70 RL 60 9550 Pave <NA> IR1
## 5 5 60 RL 84 14260 Pave <NA> IR1
## 6 6 50 RL 85 14115 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl CollgCr Norm
## 2 Lvl AllPub FR2 Gtl Veenker Feedr
## 3 Lvl AllPub Inside Gtl CollgCr Norm
## 4 Lvl AllPub Corner Gtl Crawfor Norm
## 5 Lvl AllPub FR2 Gtl NoRidge Norm
## 6 Lvl AllPub Inside Gtl Mitchel Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 2Story 7 5 2003
## 2 Norm 1Fam 1Story 6 8 1976
## 3 Norm 1Fam 2Story 7 5 2001
## 4 Norm 1Fam 2Story 7 5 1915
## 5 Norm 1Fam 2Story 8 5 2000
## 6 Norm 1Fam 1.5Fin 5 5 1993
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 2003 Gable CompShg VinylSd VinylSd BrkFace
## 2 1976 Gable CompShg MetalSd MetalSd None
## 3 2002 Gable CompShg VinylSd VinylSd BrkFace
## 4 1970 Gable CompShg Wd Sdng Wd Shng None
## 5 2000 Gable CompShg VinylSd VinylSd BrkFace
## 6 1995 Gable CompShg VinylSd VinylSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 196 Gd TA PConc Gd TA No
## 2 0 TA TA CBlock Gd TA Gd
## 3 162 Gd TA PConc Gd TA Mn
## 4 0 TA TA BrkTil TA Gd No
## 5 350 Gd TA PConc Gd TA Av
## 6 0 TA TA Wood Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 GLQ 706 Unf 0 150 856
## 2 ALQ 978 Unf 0 284 1262
## 3 GLQ 486 Unf 0 434 920
## 4 ALQ 216 Unf 0 540 756
## 5 GLQ 655 Unf 0 490 1145
## 6 GLQ 732 Unf 0 64 796
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1 GasA Ex Y SBrkr 856 854 0
## 2 GasA Ex Y SBrkr 1262 0 0
## 3 GasA Ex Y SBrkr 920 866 0
## 4 GasA Gd Y SBrkr 961 756 0
## 5 GasA Ex Y SBrkr 1145 1053 0
## 6 GasA Ex Y SBrkr 796 566 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 1710 1 0 2 1 3
## 2 1262 0 1 2 0 3
## 3 1786 1 0 2 1 3
## 4 1717 1 0 1 0 3
## 5 2198 1 0 2 1 4
## 6 1362 1 0 1 1 1
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 8 Typ 0 <NA>
## 2 1 TA 6 Typ 1 TA
## 3 1 Gd 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 9 Typ 1 TA
## 6 1 TA 5 Typ 0 <NA>
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 2003 RFn 2 548 TA
## 2 Attchd 1976 RFn 2 460 TA
## 3 Attchd 2001 RFn 2 608 TA
## 4 Detchd 1998 Unf 3 642 TA
## 5 Attchd 2000 RFn 3 836 TA
## 6 Attchd 1993 Unf 2 480 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1 TA Y 0 61 0 0
## 2 TA Y 298 0 0 0
## 3 TA Y 0 42 0 0
## 4 TA Y 0 35 272 0
## 5 TA Y 192 84 0 0
## 6 TA Y 40 30 0 320
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 <NA> <NA> <NA> 0 2 2008
## 2 0 0 <NA> <NA> <NA> 0 5 2007
## 3 0 0 <NA> <NA> <NA> 0 9 2008
## 4 0 0 <NA> <NA> <NA> 0 2 2006
## 5 0 0 <NA> <NA> <NA> 0 12 2008
## 6 0 0 <NA> MnPrv Shed 700 10 2009
## SaleType SaleCondition SalePrice
## 1 WD Normal 208500
## 2 WD Normal 181500
## 3 WD Normal 223500
## 4 WD Abnorml 140000
## 5 WD Normal 250000
## 6 WD Normal 143000
dim(train)
## [1] 1460 81
class(train)
## [1] "data.frame"
X <- train$OverallQual
Y <- train$SalePrice
x <- quantile(X, 0.25)
y <- quantile(Y, 0.5)
nums <- train %>% select_if(is.numeric) %>% select(-SalePrice)
numscorr <- cor(nums, train$SalePrice)
corrplot(numscorr, method = "color")
head(numscorr)
## [,1]
## Id -0.02191672
## MSSubClass -0.08428414
## LotFrontage NA
## LotArea 0.26384335
## OverallQual 0.79098160
## OverallCond -0.07785589
y <- getURL("https://raw.githubusercontent.com/excelsiordata/DATA605/master/test.csv")
test <- read.csv(text = y, head = TRUE, sep = ",", stringsAsFactors = FALSE)
head(test)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1461 20 RH 80 11622 Pave <NA> Reg
## 2 1462 20 RL 81 14267 Pave <NA> IR1
## 3 1463 60 RL 74 13830 Pave <NA> IR1
## 4 1464 60 RL 78 9978 Pave <NA> IR1
## 5 1465 120 RL 43 5005 Pave <NA> IR1
## 6 1466 60 RL 75 10000 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl NAmes Feedr
## 2 Lvl AllPub Corner Gtl NAmes Norm
## 3 Lvl AllPub Inside Gtl Gilbert Norm
## 4 Lvl AllPub Inside Gtl Gilbert Norm
## 5 HLS AllPub Inside Gtl StoneBr Norm
## 6 Lvl AllPub Corner Gtl Gilbert Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 1Story 5 6 1961
## 2 Norm 1Fam 1Story 6 6 1958
## 3 Norm 1Fam 2Story 5 5 1997
## 4 Norm 1Fam 2Story 6 6 1998
## 5 Norm TwnhsE 1Story 8 5 1992
## 6 Norm 1Fam 2Story 6 5 1993
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 1961 Gable CompShg VinylSd VinylSd None
## 2 1958 Hip CompShg Wd Sdng Wd Sdng BrkFace
## 3 1998 Gable CompShg VinylSd VinylSd None
## 4 1998 Gable CompShg VinylSd VinylSd BrkFace
## 5 1992 Gable CompShg HdBoard HdBoard None
## 6 1994 Gable CompShg HdBoard HdBoard None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 0 TA TA CBlock TA TA No
## 2 108 TA TA CBlock TA TA No
## 3 0 TA TA PConc Gd TA No
## 4 20 TA TA PConc TA TA No
## 5 0 Gd TA PConc Gd TA No
## 6 0 TA TA PConc Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 Rec 468 LwQ 144 270 882
## 2 ALQ 923 Unf 0 406 1329
## 3 GLQ 791 Unf 0 137 928
## 4 GLQ 602 Unf 0 324 926
## 5 ALQ 263 Unf 0 1017 1280
## 6 Unf 0 Unf 0 763 763
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1 GasA TA Y SBrkr 896 0 0
## 2 GasA TA Y SBrkr 1329 0 0
## 3 GasA Gd Y SBrkr 928 701 0
## 4 GasA Ex Y SBrkr 926 678 0
## 5 GasA Ex Y SBrkr 1280 0 0
## 6 GasA Gd Y SBrkr 763 892 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 896 0 0 1 0 2
## 2 1329 0 0 1 1 3
## 3 1629 0 0 2 1 3
## 4 1604 0 0 2 1 3
## 5 1280 0 0 2 0 2
## 6 1655 0 0 2 1 3
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 TA 5 Typ 0 <NA>
## 2 1 Gd 6 Typ 0 <NA>
## 3 1 TA 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 5 Typ 0 <NA>
## 6 1 TA 7 Typ 1 TA
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 1961 Unf 1 730 TA
## 2 Attchd 1958 Unf 1 312 TA
## 3 Attchd 1997 Fin 2 482 TA
## 4 Attchd 1998 Fin 2 470 TA
## 5 Attchd 1992 RFn 2 506 TA
## 6 Attchd 1993 Fin 2 440 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1 TA Y 140 0 0 0
## 2 TA Y 393 36 0 0
## 3 TA Y 212 34 0 0
## 4 TA Y 360 36 0 0
## 5 TA Y 0 82 0 0
## 6 TA Y 157 84 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 120 0 <NA> MnPrv <NA> 0 6 2010
## 2 0 0 <NA> <NA> Gar2 12500 6 2010
## 3 0 0 <NA> MnPrv <NA> 0 3 2010
## 4 0 0 <NA> <NA> <NA> 0 6 2010
## 5 144 0 <NA> <NA> <NA> 0 1 2010
## 6 0 0 <NA> <NA> <NA> 0 4 2010
## SaleType SaleCondition
## 1 WD Normal
## 2 WD Normal
## 3 WD Normal
## 4 WD Normal
## 5 WD Normal
## 6 WD Normal
dim(test)
## [1] 1459 80
class(test)
## [1] "data.frame"
\[P(X>x | Y>y)\] ##Probability
pXx <- length(x)
#calculate how many rows are above the first quartile
pXx1 <- train %>% filter(OverallQual > quantile(OverallQual, 0.25))
nrow(pXx1)
## [1] 922
pXx <- nrow(pXx1)/length(X)
pXx
## [1] 0.6315068
#calculate how many rows are above the 2nd quartile
pYy1 <- train %>% filter(SalePrice > quantile(SalePrice, 0.5))
nrow(pYy1)
## [1] 728
pYy <- nrow(pYy1)/length(Y)
pYy
## [1] 0.4986301
XxgivenYy <- (sum(Y > y & X > x) / length(X)) / (sum(Y > y) / length(Y))
XxgivenYy
## [1] NaN
The probability that X is greater than x given that Y is greater than y is 0.9313187 or 93.13%.
\[P(X>x , Y>y)\]
XxandYy <- sum(X > x & Y > y) / length(X)
XxandYy
## [1] 0
The probability that X is greater than x and Y is greater than y is 0.4643836 or ~46.43%.
\[P(X<x | Y>y)\]
XlessxgivenYgreaty <- (sum(Y > y & X < x) / length(X)) / (sum(Y > y) / length(Y))
XlessxgivenYgreaty
## [1] NaN
The probability that X is less than x given that Y is greater than y is 0.004120879 or 0.41%.
xy.con.table <- table(X > x, Y > y)
chisq.test(xy.con.table)
##
## Chi-squared test for given probabilities
##
## data: xy.con.table
## X-squared = 101, df = 1, p-value < 2.2e-16
\(P(A|B) \neq P(A)P(B)\). This indicates that our variables, Overall Quality and Sales Price are not independent.
The results from our Chi Squared test confirm there is a relationship between Overall Quality and Sales Price.
library(psych)
## Warning: package 'psych' was built under R version 3.3.3
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
plot(train$SalePrice ~ train$OverallQual)
boxplot(train$SalePrice ~ train$OverallQual)
by(train$SalePrice, train$OverallQual, mean)
## train$OverallQual: 1
## [1] 50150
## --------------------------------------------------------
## train$OverallQual: 2
## [1] 51770.33
## --------------------------------------------------------
## train$OverallQual: 3
## [1] 87473.75
## --------------------------------------------------------
## train$OverallQual: 4
## [1] 108420.7
## --------------------------------------------------------
## train$OverallQual: 5
## [1] 133523.3
## --------------------------------------------------------
## train$OverallQual: 6
## [1] 161603
## --------------------------------------------------------
## train$OverallQual: 7
## [1] 207716.4
## --------------------------------------------------------
## train$OverallQual: 8
## [1] 274735.5
## --------------------------------------------------------
## train$OverallQual: 9
## [1] 367513
## --------------------------------------------------------
## train$OverallQual: 10
## [1] 438588.4
by(train$SalePrice, train$OverallQual, length)
## train$OverallQual: 1
## [1] 2
## --------------------------------------------------------
## train$OverallQual: 2
## [1] 3
## --------------------------------------------------------
## train$OverallQual: 3
## [1] 20
## --------------------------------------------------------
## train$OverallQual: 4
## [1] 116
## --------------------------------------------------------
## train$OverallQual: 5
## [1] 397
## --------------------------------------------------------
## train$OverallQual: 6
## [1] 374
## --------------------------------------------------------
## train$OverallQual: 7
## [1] 319
## --------------------------------------------------------
## train$OverallQual: 8
## [1] 168
## --------------------------------------------------------
## train$OverallQual: 9
## [1] 43
## --------------------------------------------------------
## train$OverallQual: 10
## [1] 18
hist(train$SalePrice, xlab = "Sale Price", main = "Histogram of Sale Price")
plot(train$OverallQual)
describe(train$OverallQual)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 1460 6.1 1.38 6 6.08 1.48 1 10 9 0.22 0.09
## se
## X1 0.04
describe(train$SalePrice)
## vars n mean sd median trimmed mad min max range
## X1 1 1460 180921.2 79442.5 163000 170783.3 56338.8 34900 755000 720100
## skew kurtosis se
## X1 1.88 6.5 2079.11
bc <- boxcox(train$SalePrice ~ train$OverallQual)
plot(bc)
numscorr <- cor(nums)
allcorr <- corrplot(numscorr, type = "full", method = "circle", sig.level = 0.01, insig = "blank")
numscorr3 <- nums %>%
dplyr::select(TotalBsmtSF,X1stFlrSF,Fireplaces) %>%
cor()
invnumscorr3<-solve(numscorr3)
numscorr3 %*% invnumscorr3
## TotalBsmtSF X1stFlrSF Fireplaces
## TotalBsmtSF 1.000000e+00 -5.551115e-17 0.000000e+00
## X1stFlrSF 4.943962e-17 1.000000e+00 -5.551115e-17
## Fireplaces 2.619432e-16 -1.110223e-16 1.000000e+00
invnumscorr3 %*% numscorr3
## TotalBsmtSF X1stFlrSF Fireplaces
## TotalBsmtSF 1.000000e+00 5.551115e-17 2.775558e-16
## X1stFlrSF -5.551115e-17 1.000000e+00 -1.110223e-16
## Fireplaces 0.000000e+00 -5.551115e-17 1.000000e+00
fitdistr(train$OverallQual, densfun = "log-normal")
## meanlog sdlog
## 1.780784542 0.241275908
## (0.006314479) (0.004465011)
hist(train$OverallQual, main = "Full Overall Quality Population", xlab = "Overall Quality Rating")
OQ1000 <- sample(train$OverallQual, 1000)
fitdistr(OQ1000, densfun = "log-normal")
## meanlog sdlog
## 1.785673403 0.232424204
## (0.007349899) (0.005197163)
hist(OQ1000, main = "Subset of the Overall Quality Population", xlab = "Overall Quality Rating")
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.3.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
library(ModelMetrics)
## Warning: package 'ModelMetrics' was built under R version 3.3.3
train <- read.csv(text = url, head = TRUE, sep = ",", stringsAsFactors = FALSE)
train1 <- train %>% select_if(is.numeric) %>% dplyr::select(-Id)
train1[is.na(train1)] <- -1
fit <- randomForest(SalePrice ~ .,data=train1,
ntree=800, verbose=T)
summary(fit)
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 1460 -none- numeric
## mse 800 -none- numeric
## rsq 800 -none- numeric
## oob.times 1460 -none- numeric
## importance 36 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 1460 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
prd <- predict(fit,train1)
test1 <- test
test1[is.na(test1)] <- -1
testprd <- (predict(fit,test1))
testprd.df <- data.frame(testprd)
testprd.df$Id <- 1461:2919
names(testprd.df)[names(testprd.df)=="testprd"] <- "SalePrice"
testprd.df <- testprd.df[c(2, 1)]
head(testprd.df)
## Id SalePrice
## 1 1461 127633.5
## 2 1462 154264.9
## 3 1463 182640.7
## 4 1464 184318.0
## 5 1465 192584.3
## 6 1466 185055.2
rmse(train1$SalePrice,prd)
## [1] 12181.35
rmse(X,testprd)
## [1] 192959.7
varImpPlot(fit)
write.csv(testprd.df, file = "KLS_Results.csv")