require(tidyverse)
require(GGally)
require(kableExtra)
require(psych)
require(reshape)
require(dplyr)
require(plotly)
require(ggplot2)
require(tidyr)
require(corrplot)
require(matrixcalc)
require(RColorBrewer)
require(MASS)
require(gmodels)
require(mice)
require(e1071)
require(randomForest)
require(vcd)
##Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean of mu = sigma =(N+1)/2.
# Simulate the two samples: X ~ Uniform(1, N) and Y ~ Normal with mean and
# standard deviation both equal to (N + 1) / 2. The seed fixes the draws.
set.seed(123)
N <- 25
X <- runif(n = 10000, min = 1, max = N)
Y <- rnorm(n = 10000, mean = (N + 1) / 2, sd = (N + 1) / 2)
# Keep both samples side by side; allnum is the common sample size used as the
# denominator of every empirical probability below.
rnum <- data.frame(X = X, Y = Y)
allnum <- nrow(rnum)
5 points
# Cut-offs used throughout: x is the median of X, y is the second element of
# summary(Y), i.e. the first quartile of Y.
x <- median(X)
y <- summary(Y)[2]
# Conditional probability P(X > x | X > y) = P(X > x & X > y) / P(X > y).
XGy <- nrow(subset(rnum, X > y)) / allnum ## P(X>y)
# Both conditions in the numerator are on X. The previous version filtered on
# Y > y here, which computed P(X>x & Y>y) instead of the stated P(X>x & X>y).
XxGYy <- nrow(subset(rnum, X > x & X > y)) / allnum ## P(X>x & X>y)
round(XxGYy / XGy, 4)
## NOTE: the output recorded before this fix ([1] 0.4354) was computed with
## Y>y in the numerator; re-run to refresh the printed value.
# Joint probability P(X > x & Y > y): share of rows satisfying both conditions.
with(rnum, sum(X > x & Y > y)) / allnum
## [1] 0.3756
# Conditional probability P(X < x | X > y): share of rows with y < X < x,
# divided by P(X > y) computed earlier (XGy).
XlXXGy <- with(rnum, sum(X < x & X > y)) / allnum
round(XlXXGy / XGy, 4)
## [1] 0.4204
# Marginal and joint empirical probabilities of the events X > x and Y > y.
Xx <- with(rnum, sum(X > x)) / allnum
Yy <- with(rnum, sum(Y > y)) / allnum
XxYy <- with(rnum, sum(X > x & Y > y)) / allnum
# Product of the marginals; it matches the joint probability when the two
# events are independent.
prod <- Xx * Yy
# Compare at two decimal places and keep the verdict as a string so it can be
# shown in the summary table; the verdict is also printed.
eq <- if (round(prod, 2) == round(XxYy, 2)) "True" else "False"
print(eq)
## [1] "True"
# Render the marginal probabilities, their product, the equality verdict and
# the joint probability as one table. cbind() mixes numbers with the character
# flag `eq`, so every cell is rendered as text.
kable(
  cbind(Xx, Yy, prod, eq, XxYy),
  col.names = c("P(X>x)", "P(Y>y)", "P(X>x)_P(Y>y)", "Equal", "P(X>x & Y>y)")
) %>%
  # FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
  kable_styling("responsive", full_width = FALSE, position = "left")
| P(X>x) | P(Y>y) | P(X>x)_P(Y>y) | Equal | P(X>x & Y>y) |
|---|---|---|---|---|
| 0.5 | 0.75 | 0.375 | True | 0.3756 |
# Split the sample on each cut-off. gRx / lEx are reused below to build the
# joint counts; gRy / lEy are kept so the marginal subsets remain available.
gRx <- subset(rnum, X > x)
gRy <- subset(rnum, Y > y)
lEx <- subset(rnum, X <= x)
lEy <- subset(rnum, Y <= y)
# 2x2 contingency table of JOINT counts: rows split on X > x, columns on Y > y,
# so every observation is counted exactly once. The previous table held the
# marginal counts of X and Y (and repeated nrow(lEx) in the Y <= y cell), which
# does not test independence of the two events.
# matrix() fills column by column: first the Y > y column, then Y <= y.
conTable <- matrix(
  c(
    nrow(subset(gRx, Y > y)), nrow(subset(lEx, Y > y)),
    nrow(subset(gRx, Y <= y)), nrow(subset(lEx, Y <= y))
  ),
  nrow = 2, ncol = 2,
  dimnames = list(c("X>x", "X<=x"), c("Y>y", "Y<=y"))
)
kable(conTable) %>%
  kable_styling("responsive", full_width = FALSE, position = "left")
# Independence of the events X > x and Y > y: chi-squared test (with Yates'
# continuity correction by default for 2x2 tables) and Fisher's exact test.
chisq.test(conTable)
fisher.test(conTable)
## NOTE: the table and test output recorded earlier (X-squared = 224.6,
## p-value < 2.2e-16, odds ratio 0.6666973) were produced from the table of
## marginal counts and no longer apply; re-run this section to regenerate them.
You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following.
# Load the Kaggle House Prices training and test sets from the GitHub mirror.
mctrain <- read.csv(
  file = "https://raw.githubusercontent.com/Luz917/data605final/master/train.csv"
)
mctest <- read.csv(
  file = "https://raw.githubusercontent.com/Luz917/data605final/master/test.csv"
)
# Rows and columns of the training set.
dim(mctrain)
## [1] 1460 81
# Column-by-column overview: type and first values of every variable.
glimpse(mctrain)
## Observations: 1,460
## Variables: 81
## $ Id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ MSSubClass <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, ...
## $ MSZoning <fct> RL, RL, RL, RL, RL, RL, RL, RL, RM, RL, RL, RL, RL, R...
## $ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 9...
## $ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, ...
## $ Street <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave,...
## $ Alley <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ LotShape <fct> Reg, Reg, IR1, IR1, IR1, IR1, Reg, IR1, Reg, Reg, Reg...
## $ LandContour <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl...
## $ Utilities <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllPu...
## $ LotConfig <fct> Inside, FR2, Inside, Corner, FR2, Inside, Inside, Cor...
## $ LandSlope <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl...
## $ Neighborhood <fct> CollgCr, Veenker, CollgCr, Crawfor, NoRidge, Mitchel,...
## $ Condition1 <fct> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, PosN, Arte...
## $ Condition2 <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm,...
## $ BldgType <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam,...
## $ HouseStyle <fct> 2Story, 1Story, 2Story, 2Story, 2Story, 1.5Fin, 1Stor...
## $ OverallQual <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4,...
## $ OverallCond <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5,...
## $ YearBuilt <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931,...
## $ YearRemodAdd <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950,...
## $ RoofStyle <fct> Gable, Gable, Gable, Gable, Gable, Gable, Gable, Gabl...
## $ RoofMatl <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg,...
## $ Exterior1st <fct> VinylSd, MetalSd, VinylSd, Wd Sdng, VinylSd, VinylSd,...
## $ Exterior2nd <fct> VinylSd, MetalSd, VinylSd, Wd Shng, VinylSd, VinylSd,...
## $ MasVnrType <fct> BrkFace, None, BrkFace, None, BrkFace, None, Stone, S...
## $ MasVnrArea <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 30...
## $ ExterQual <fct> Gd, TA, Gd, TA, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, G...
## $ ExterCond <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, T...
## $ Foundation <fct> PConc, CBlock, PConc, BrkTil, PConc, Wood, PConc, CBl...
## $ BsmtQual <fct> Gd, Gd, Gd, TA, Gd, Gd, Ex, Gd, TA, TA, TA, Ex, TA, G...
## $ BsmtCond <fct> TA, TA, TA, Gd, TA, TA, TA, TA, TA, TA, TA, TA, TA, T...
## $ BsmtExposure <fct> No, Gd, Mn, No, Av, No, Av, Mn, No, No, No, No, No, A...
## $ BsmtFinType1 <fct> GLQ, ALQ, GLQ, ALQ, GLQ, GLQ, GLQ, ALQ, Unf, GLQ, Rec...
## $ BsmtFinSF1 <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906,...
## $ BsmtFinType2 <fct> Unf, Unf, Unf, Unf, Unf, Unf, Unf, BLQ, Unf, Unf, Unf...
## $ BsmtFinSF2 <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ BsmtUnfSF <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134,...
## $ TotalBsmtSF <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991,...
## $ Heating <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA,...
## $ HeatingQC <fct> Ex, Ex, Ex, Gd, Ex, Ex, Ex, Ex, Gd, Ex, Ex, Ex, TA, E...
## $ CentralAir <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,...
## $ Electrical <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrk...
## $ X1stFlrSF <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 107...
## $ X2ndFlrSF <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142,...
## $ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ GrLivArea <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774,...
## $ BsmtFullBath <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,...
## $ BsmtHalfBath <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FullBath <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2,...
## $ HalfBath <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ BedroomAbvGr <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2,...
## $ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2,...
## $ KitchenQual <fct> Gd, TA, Gd, Gd, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, G...
## $ TotRmsAbvGrd <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6...
## $ Functional <fct> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Min1, Typ, Ty...
## $ Fireplaces <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0,...
## $ FireplaceQu <fct> NA, TA, TA, Gd, TA, NA, Gd, TA, TA, TA, NA, Gd, NA, G...
## $ GarageType <fct> Attchd, Attchd, Attchd, Detchd, Attchd, Attchd, Attch...
## $ GarageYrBlt <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931,...
## $ GarageFinish <fct> RFn, RFn, RFn, Unf, RFn, Unf, RFn, RFn, Unf, RFn, Unf...
## $ GarageCars <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2,...
## $ GarageArea <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384...
## $ GarageQual <fct> TA, TA, TA, TA, TA, TA, TA, TA, Fa, Gd, TA, TA, TA, T...
## $ GarageCond <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, T...
## $ PavedDrive <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,...
## $ WoodDeckSF <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, ...
## $ OpenPorchSF <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 2...
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, ...
## $ X3SsnPorch <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ ScreenPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, ...
## $ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ PoolQC <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ Fence <fct> NA, NA, NA, NA, NA, MnPrv, NA, NA, NA, NA, NA, NA, NA...
## $ MiscFeature <fct> NA, NA, NA, NA, NA, Shed, NA, Shed, NA, NA, NA, NA, N...
## $ MiscVal <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 7...
## $ MoSold <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3,...
## $ YrSold <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008,...
## $ SaleType <fct> WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, New, WD, ...
## $ SaleCondition <fct> Normal, Normal, Normal, Abnorml, Normal, Normal, Norm...
## $ SalePrice <int> 208500, 181500, 223500, 140000, 250000, 143000, 30700...
# Per-column descriptive statistics of the training set (quartiles for numeric
# columns, level counts for categorical ones, and NA counts).
summary(mctrain)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 C (all): 10 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 FV : 65 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 RH : 16 Median : 69.00
## Mean : 730.5 Mean : 56.9 RL :1151 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 RM : 218 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape LandContour Utilities
## Min. : 1300 Grvl: 6 Grvl: 50 IR1:484 Bnk: 63 AllPub:1459
## 1st Qu.: 7554 Pave:1454 Pave: 41 IR2: 41 HLS: 50 NoSeWa: 1
## Median : 9478 NA's:1369 IR3: 10 Low: 36
## Mean : 10517 Reg:925 Lvl:1311
## 3rd Qu.: 11602
## Max. :215245
##
## LotConfig LandSlope Neighborhood Condition1 Condition2
## Corner : 263 Gtl:1382 NAmes :225 Norm :1260 Norm :1445
## CulDSac: 94 Mod: 65 CollgCr:150 Feedr : 81 Feedr : 6
## FR2 : 47 Sev: 13 OldTown:113 Artery : 48 Artery : 2
## FR3 : 4 Edwards:100 RRAn : 26 PosN : 2
## Inside :1052 Somerst: 86 PosN : 19 RRNn : 2
## Gilbert: 79 RRAe : 11 PosA : 1
## (Other):707 (Other): 15 (Other): 2
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1Fam :1220 1Story :726 Min. : 1.000 Min. :1.000 Min. :1872
## 2fmCon: 31 2Story :445 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Duplex: 52 1.5Fin :154 Median : 6.000 Median :5.000 Median :1973
## Twnhs : 43 SLvl : 65 Mean : 6.099 Mean :5.575 Mean :1971
## TwnhsE: 114 SFoyer : 37 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## 1.5Unf : 14 Max. :10.000 Max. :9.000 Max. :2010
## (Other): 19
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## Min. :1950 Flat : 13 CompShg:1434 VinylSd:515 VinylSd:504
## 1st Qu.:1967 Gable :1141 Tar&Grv: 11 HdBoard:222 MetalSd:214
## Median :1994 Gambrel: 11 WdShngl: 6 MetalSd:220 HdBoard:207
## Mean :1985 Hip : 286 WdShake: 5 Wd Sdng:206 Wd Sdng:197
## 3rd Qu.:2004 Mansard: 7 ClyTile: 1 Plywood:108 Plywood:142
## Max. :2010 Shed : 2 Membran: 1 CemntBd: 61 CmentBd: 60
## (Other): 2 (Other):128 (Other):136
## MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual
## BrkCmn : 15 Min. : 0.0 Ex: 52 Ex: 3 BrkTil:146 Ex :121
## BrkFace:445 1st Qu.: 0.0 Fa: 14 Fa: 28 CBlock:634 Fa : 35
## None :864 Median : 0.0 Gd:488 Gd: 146 PConc :647 Gd :618
## Stone :128 Mean : 103.7 TA:906 Po: 1 Slab : 24 TA :649
## NA's : 8 3rd Qu.: 166.0 TA:1282 Stone : 6 NA's: 37
## Max. :1600.0 Wood : 3
## NA's :8
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Fa : 45 Av :221 ALQ :220 Min. : 0.0 ALQ : 19
## Gd : 65 Gd :134 BLQ :148 1st Qu.: 0.0 BLQ : 33
## Po : 2 Mn :114 GLQ :418 Median : 383.5 GLQ : 14
## TA :1311 No :953 LwQ : 74 Mean : 443.6 LwQ : 46
## NA's: 37 NA's: 38 Rec :133 3rd Qu.: 712.2 Rec : 54
## Unf :430 Max. :5644.0 Unf :1256
## NA's: 37 NA's: 38
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Floor: 1 Ex:741
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 GasA :1428 Fa: 49
## Median : 0.00 Median : 477.5 Median : 991.5 GasW : 18 Gd:241
## Mean : 46.55 Mean : 567.2 Mean :1057.4 Grav : 7 Po: 1
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2 OthW : 2 TA:428
## Max. :1474.00 Max. :2336.0 Max. :6110.0 Wall : 4
##
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## N: 95 FuseA: 94 Min. : 334 Min. : 0 Min. : 0.000
## Y:1365 FuseF: 27 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
## FuseP: 3 Median :1087 Median : 0 Median : 0.000
## Mix : 1 Mean :1163 Mean : 347 Mean : 5.845
## SBrkr:1334 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
## NA's : 1 Max. :4692 Max. :2065 Max. :572.000
##
## GrLivArea BsmtFullBath BsmtHalfBath FullBath
## Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
## Median :1464 Median :0.0000 Median :0.00000 Median :2.000
## Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
## 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
## Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
##
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## Min. :0.0000 Min. :0.000 Min. :0.000 Ex:100 Min. : 2.000
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 Fa: 39 1st Qu.: 5.000
## Median :0.0000 Median :3.000 Median :1.000 Gd:586 Median : 6.000
## Mean :0.3829 Mean :2.866 Mean :1.047 TA:735 Mean : 6.518
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :2.0000 Max. :8.000 Max. :3.000 Max. :14.000
##
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## Maj1: 14 Min. :0.000 Ex : 24 2Types : 6 Min. :1900
## Maj2: 5 1st Qu.:0.000 Fa : 33 Attchd :870 1st Qu.:1961
## Min1: 31 Median :1.000 Gd :380 Basment: 19 Median :1980
## Min2: 34 Mean :0.613 Po : 20 BuiltIn: 88 Mean :1979
## Mod : 15 3rd Qu.:1.000 TA :313 CarPort: 9 3rd Qu.:2002
## Sev : 1 Max. :3.000 NA's:690 Detchd :387 Max. :2010
## Typ :1360 NA's : 81 NA's :81
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## Fin :352 Min. :0.000 Min. : 0.0 Ex : 3 Ex : 2
## RFn :422 1st Qu.:1.000 1st Qu.: 334.5 Fa : 48 Fa : 35
## Unf :605 Median :2.000 Median : 480.0 Gd : 14 Gd : 9
## NA's: 81 Mean :1.767 Mean : 473.0 Po : 3 Po : 7
## 3rd Qu.:2.000 3rd Qu.: 576.0 TA :1311 TA :1326
## Max. :4.000 Max. :1418.0 NA's: 81 NA's: 81
##
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## N: 90 Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## P: 30 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Y:1340 Median : 0.00 Median : 25.00 Median : 0.00 Median : 0.00
## Mean : 94.24 Mean : 46.66 Mean : 21.95 Mean : 3.41
## 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :857.00 Max. :547.00 Max. :552.00 Max. :508.00
##
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## Min. : 0.00 Min. : 0.000 Ex : 2 GdPrv: 59 Gar2: 2
## 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2 GdWo : 54 Othr: 2
## Median : 0.00 Median : 0.000 Gd : 3 MnPrv: 157 Shed: 49
## Mean : 15.06 Mean : 2.759 NA's:1453 MnWw : 11 TenC: 1
## 3rd Qu.: 0.00 3rd Qu.: 0.000 NA's :1179 NA's:1406
## Max. :480.00 Max. :738.000
##
## MiscVal MoSold YrSold SaleType
## Min. : 0.00 Min. : 1.000 Min. :2006 WD :1267
## 1st Qu.: 0.00 1st Qu.: 5.000 1st Qu.:2007 New : 122
## Median : 0.00 Median : 6.000 Median :2008 COD : 43
## Mean : 43.49 Mean : 6.322 Mean :2008 ConLD : 9
## 3rd Qu.: 0.00 3rd Qu.: 8.000 3rd Qu.:2009 ConLI : 5
## Max. :15500.00 Max. :12.000 Max. :2010 ConLw : 5
## (Other): 9
## SaleCondition SalePrice
## Abnorml: 101 Min. : 34900
## AdjLand: 4 1st Qu.:129975
## Alloca : 12 Median :163000
## Family : 20 Mean :180921
## Normal :1198 3rd Qu.:214000
## Partial: 125 Max. :755000
##
# Histograms of every numeric column of the training set, one facet each.
# Keep only the numeric columns ...
trianplot <- select_if(mctrain, is.numeric)
# ... and stack them into long key/value form so each variable gets a facet.
trainplot1 <- gather(trianplot)
tp1 <- ggplot(trainplot1, aes(x = value)) +
  geom_histogram() +
  facet_wrap(~ key, scales = "free")
# Interactive version of the faceted histogram.
ggplotly(tp1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 348 rows containing non-finite values (stat_bin).
# Pairwise plot matrix of columns 2 through 10 of the training set.
ggpairs(mctrain [,2:10], pch=10)
# Pairwise plot matrix of columns 6 through 11, coloured by Street.
ggpairs(mctrain, columns = c(6:11), mapping=ggplot2::aes(colour = Street),pch=20)
# Scatter/histogram/correlation panels for SalePrice and the three predictors
# studied below. select() is namespaced explicitly; presumably because MASS,
# loaded at the top of the file, also defines select().
mctrain %>%
dplyr::select(c("SalePrice", "GrLivArea", "BsmtUnfSF", "MSSubClass" )) %>%
pairs.panels (method = "pearson", hist.col="lightblue")
#### Looking at these plots, one can tell that the distributions are skewed to the right rather than normally distributed.
# Scatter plots of SalePrice against each of the three chosen predictors.
# Columns are referenced by bare name inside aes() so they are looked up in the
# `data` argument; `mctrain$col` inside aes() bypasses that lookup and leaks
# the data-frame name into axis and hover labels.

# Unfinished basement area vs. sale price.
tbs <- ggplot(mctrain, aes(x = BsmtUnfSF, y = SalePrice)) +
  geom_point() +
  xlab("Basement Unfinished") +
  ylab("Sale Price") +
  ggtitle("Basement Unfinished vs. Sale Price")
ggplotly(tbs)

# Above-ground living area vs. sale price.
tgs <- ggplot(mctrain, aes(x = GrLivArea, y = SalePrice)) +
  geom_point() +
  xlab("Ground Living Area") +
  ylab("Sale Price") +
  ggtitle("Ground Living Area vs. Sale Price")
ggplotly(tgs)

# Dwelling sub-class code vs. sale price.
tms <- ggplot(mctrain, aes(x = MSSubClass, y = SalePrice)) +
  geom_point() +
  xlab("SubClass") +
  ylab("Sale Price") +
  ggtitle("SubClass vs. Sale Price")
ggplotly(tms)
# Pearson correlation matrix of SalePrice and the three chosen predictors.
mctraincorr <- cor(
  mctrain[, c("SalePrice", "GrLivArea", "BsmtUnfSF", "MSSubClass")]
)
mctraincorr
## SalePrice GrLivArea BsmtUnfSF MSSubClass
## SalePrice 1.00000000 0.70862448 0.2144791 -0.08428414
## GrLivArea 0.70862448 1.00000000 0.2402573 0.07485318
## BsmtUnfSF 0.21447911 0.24025727 1.0000000 -0.14075948
## MSSubClass -0.08428414 0.07485318 -0.1407595 1.00000000
# Visualise the correlation matrix: coefficients as numbers in the upper
# triangle, coloured tiles in the lower triangle, each with its own 4-colour
# RColorBrewer palette.
corrplot.mixed(mctraincorr, upper = "number", lower="color", lower.col = brewer.pal(n=4, name= "YlGnBu"), upper.col = brewer.pal(n=4, name="RdYlBu"))
# Test H0: zero Pearson correlation between GrLivArea and SalePrice, reporting
# an 80 percent confidence interval for the correlation.
cor.test(mctrain$GrLivArea, mctrain$SalePrice, method = "pearson", conf.level = .80)
##
## Pearson's product-moment correlation
##
## data: mctrain$GrLivArea and mctrain$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6915087 0.7249450
## sample estimates:
## cor
## 0.7086245
# Test H0: zero Pearson correlation between BsmtUnfSF and SalePrice, reporting
# an 80 percent confidence interval for the correlation.
cor.test(mctrain$BsmtUnfSF, mctrain$SalePrice, method = "pearson", conf.level = .80)
##
## Pearson's product-moment correlation
##
## data: mctrain$BsmtUnfSF and mctrain$SalePrice
## t = 8.3847, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.1822292 0.2462680
## sample estimates:
## cor
## 0.2144791
# Test H0: zero Pearson correlation between MSSubClass and SalePrice, reporting
# an 80 percent confidence interval for the correlation.
cor.test(mctrain$MSSubClass, mctrain$SalePrice, method = "pearson", conf.level = .80)
##
## Pearson's product-moment correlation
##
## data: mctrain$MSSubClass and mctrain$SalePrice
## t = -3.2298, df = 1458, p-value = 0.001266
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.11751336 -0.05086638
## sample estimates:
## cor
## -0.08428414
# Show the correlation matrix again before inverting it.
mctraincorr
## SalePrice GrLivArea BsmtUnfSF MSSubClass
## SalePrice 1.00000000 0.70862448 0.2144791 -0.08428414
## GrLivArea 0.70862448 1.00000000 0.2402573 0.07485318
## BsmtUnfSF 0.21447911 0.24025727 1.0000000 -0.14075948
## MSSubClass -0.08428414 0.07485318 -0.1407595 1.00000000
# Inverse of the correlation matrix (the precision matrix).
solve(mctraincorr)
## SalePrice GrLivArea BsmtUnfSF MSSubClass
## SalePrice 2.09054636 -1.4901976 -0.05085229 0.2805880
## GrLivArea -1.49019759 2.1372579 -0.23880517 -0.3191947
## BsmtUnfSF -0.05085229 -0.2388052 1.09182687 0.1672743
## MSSubClass 0.28058798 -0.3191947 0.16727427 1.0710873
# Correlation matrix times its inverse: should be the identity matrix up to
# floating-point rounding error.
mctraincorr %*% solve(mctraincorr)
## SalePrice GrLivArea BsmtUnfSF MSSubClass
## SalePrice 1.000000e+00 6.938894e-18 5.204170e-18 1.387779e-17
## GrLivArea 1.457168e-16 1.000000e+00 6.938894e-18 1.387779e-17
## BsmtUnfSF 1.387779e-17 6.245005e-17 1.000000e+00 0.000000e+00
## MSSubClass -5.551115e-17 0.000000e+00 0.000000e+00 1.000000e+00
# Inverse times the correlation matrix (the other order): again the identity
# matrix up to floating-point rounding error.
solve(mctraincorr) %*% mctraincorr
## SalePrice GrLivArea BsmtUnfSF MSSubClass
## SalePrice 1.000000e+00 -7.285839e-17 -5.551115e-17 0.000000e+00
## GrLivArea 2.220446e-16 1.000000e+00 9.020562e-17 5.551115e-17
## BsmtUnfSF 6.938894e-18 3.469447e-18 1.000000e+00 -2.775558e-17
## MSSubClass -2.775558e-17 0.000000e+00 2.775558e-17 1.000000e+00
# LU-decompose the correlation matrix and multiply the factors back together
# to confirm that L %*% U reproduces the original matrix.
luMat <- lu.decomposition(mctraincorr)
with(luMat, L %*% U)
## [,1] [,2] [,3] [,4]
## [1,] 1.00000000 0.70862448 0.2144791 -0.08428414
## [2,] 0.70862448 1.00000000 0.2402573 0.07485318
## [3,] 0.21447911 0.24025727 1.0000000 -0.14075948
## [4,] -0.08428414 0.07485318 -0.1407595 1.00000000
# Distribution of the unfinished basement area in the training set.
hist(mctrain$BsmtUnfSF, breaks = 30, main = "BsmtUnfSF", col = "blue")
# Fit an exponential distribution to BsmtUnfSF and pull out the estimated rate
# parameter.
fitdes <- fitdistr(mctrain$BsmtUnfSF, densfun = "exponential")
rate <- fitdes[["estimate"]]
rate
## rate
## 0.001762921
# Draw 1000 values from the fitted exponential and keep the observed data under
# a short name for the comparisons below.
exponen <- rexp(1000, rate)
org <- mctrain$BsmtUnfSF
# Show simulated and observed histograms side by side. par() returns the
# previous settings, which are restored afterwards so the two-panel layout does
# not leak into later base-graphics plots.
old_par <- par(mfrow = c(1, 2))
hist(exponen, breaks = 30, xlim = c(0, 4000), main = "Exponential - BsmtUnfSF",
     col = "purple")
hist(mctrain$BsmtUnfSF, breaks = 30, main = "Original - BsmtUnfSF", col = "red")
par(old_par)
# 5th and 95th percentiles taken from the empirical CDF of the simulated draws.
# NOTE(review): qexp(c(0.05, 0.95), rate) would give the fitted distribution's
# percentiles directly, without simulation noise - confirm which is intended.
quantile(ecdf(exponen), c(0.05, .95))
## 5% 95%
## 32.85567 1597.22999
# 95 percent confidence interval for the mean of the observed BsmtUnfSF values
# (ci() is presumably gmodels::ci, loaded at the top of the file).
ci(org, confidence = .95)
## Estimate CI lower CI upper Std. Error
## 567.24041 544.55620 589.92462 11.56419
# Empirical 5th and 95th percentiles of the observed data, for comparison with
# the percentiles of the simulated exponential sample.
quantile(org, c(.05, .95))
## 5% 95%
## 0 1468
# Stack train and test so both receive identical preprocessing.
mctraintest <- bind_rows(mctrain, mctest) ## combine train and test data
# Categorical columns: text columns, plus any column that is already a factor,
# so such columns are not silently dropped. vapply() guarantees a logical
# vector and drop = FALSE keeps a data frame even if only one column matches.
is_categorical <- vapply(
  mctraintest,
  function(col) is.character(col) || is.factor(col),
  logical(1)
)
charvar <- mctraintest[, is_categorical, drop = FALSE] ## categorical variables get separated
# Work on plain text so the replacement label below can be assigned freely.
charvar[] <- lapply(charvar, as.character)
charvar[is.na(charvar)] <- "Not Applicable" ## NA's become their own factor level
factortt <- charvar %>%
  lapply(as.factor) %>%
  as.data.frame()
# Numeric columns (integer or double); their NAs are left for imputation.
int <- mctraintest[, vapply(mctraintest, is.numeric, logical(1)), drop = FALSE] ## numeric variables get separated
mctraintest <- bind_cols(factortt, int) ## recombine the factor and numeric columns
# Impute the remaining missing values with mice, using the random-forest
# method ("rf") for every incomplete column.
mmod <- mice(mctraintest, method = "rf")
##
## iter imp variable
## 1 1 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 1 2 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 1 3 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 1 4 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 1 5 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 2 1 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 2 2 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 2 3 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 2 4 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 2 5 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 3 1 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 3 2 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 3 3 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 3 4 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 3 5 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 4 1 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 4 2 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 4 3 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 4 4 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 4 5 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 5 1 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 5 2 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 5 3 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 5 4 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
## 5 5 LotFrontage MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath GarageYrBlt GarageCars GarageArea SalePrice
# Diagnostic plots comparing observed and imputed values.
densityplot(mmod)
stripplot(mmod, pch = 5, cex = 1.2)
# Fill in the imputed values to get one complete data set. The namespace is
# spelled out because tidyr (loaded above) also provides a complete().
mctraintest <- mice::complete(mmod)
# Undo the row-stacking: the first nrow(mctrain) rows are the training set and
# the rest are the test set. A logical mask is used instead of 1:n / (n+1):m
# ranges, which count backwards and select wrong rows when either part is empty.
n_train <- nrow(mctrain)
is_train_row <- seq_len(nrow(mctraintest)) <= n_train
mctrain1 <- mctraintest[is_train_row, ]
mctest1 <- mctraintest[!is_train_row, ]
# Fit both models on every remaining column and predict the test rows.
# NOTE(review): `SalePrice ~ .` also uses the Id column as a predictor -
# confirm that is intended.
svmmod <- svm(SalePrice ~ ., data = mctrain1, cost = 3)
svmpred <- predict(svmmod, newdata = mctest1)
rfmod <- randomForest(SalePrice ~ ., data = mctrain1)
rfpred <- predict(rfmod, newdata = mctest1)
# create submission file
# Pair each test Id with its predicted price, one data frame per model
# (1 = SVM, 2 = random forest).
mcsubmission1 <- data.frame(Id = mctest$Id, SalePrice = svmpred)
mcsubmission2 <- data.frame(Id = mctest$Id, SalePrice = rfpred)
# Write the two Kaggle submission files: header row, no quoting, no row names.
write.csv(mcsubmission1, file = "MC Submission 1.csv", quote = FALSE, row.names = FALSE)
write.csv(mcsubmission2, file = "MC Submission 2.csv", quote = FALSE, row.names = FALSE)
# Five-number summary (plus mean) of the SVM predictions for the test set.
summary(svmpred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 53747 134173 164306 180179 211928 486739
# Same summary for the random forest predictions, for comparison.
summary(rfpred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 66489 131007 158944 179331 209911 474528
Out of the 2 submissions, the random forest model received the better score.
Username for Kaggle: maryluzcruz
# Embed the image file 'Untitled.png' in the knitted document (presumably a
# screenshot of the Kaggle score - confirm).
knitr::include_graphics('Untitled.png')