require(tidyverse)                    
require(GGally) 
require(kableExtra)
require(psych)
require(reshape)
require(dplyr)
require(plotly)
require(ggplot2)
require(tidyr)
require(corrplot)
require(matrixcalc)
require(RColorBrewer)
require(MASS)
require(gmodels)
require(mice)
require(e1071)
require(randomForest)
require(vcd)

Problem 1

##Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with mean and standard deviation $\mu = \sigma = (N+1)/2$.

Generate Random Numbers

# Simulate the two samples used throughout Problem 1:
#   X: 10,000 uniform draws on [1, N]
#   Y: 10,000 normal draws with mean = sd = (N + 1) / 2
# The seed is fixed so the results are reproducible.
set.seed(123)
N <- 25
X <- runif(n = 10000, min = 1, max = N)
Y <- rnorm(n = 10000, mean = (N + 1) / 2, sd = (N + 1) / 2)
rnum <- data.frame(X = X, Y = Y)
allnum <- nrow(rnum)  # number of simulated (X, Y) pairs

Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

5 points

# Thresholds used in the probability questions below:
#   x = median of X, y = first quartile (25th percentile) of Y.
# y is taken from quantile() directly rather than by indexing summary(Y) by
# position, which depends on the layout of the summary object.
x <- median(X)
y <- quantile(Y, probs = 0.25, names = FALSE)

a. P(X>x | X>y)

# P(X > x | X > y) = P(X > x and X > y) / P(X > y).
# Both events concern X, so the joint count in the numerator must be taken on
# X alone (it previously conditioned on Y > y, mixing two different events).
XGy <- nrow(subset(rnum, X > y)) / allnum            # P(X > y)
XxGYy <- nrow(subset(rnum, X > x & X > y)) / allnum  # P(X > x and X > y)
round(XxGYy / XGy, 4)
## [1] 0.5796

b. P(X>x, Y>y)

# Joint probability P(X > x, Y > y): share of rows meeting both conditions.
with(rnum, sum(X > x & Y > y)) / allnum
## [1] 0.3756

c. P(X<x | X>y)

# P(X < x | X > y): joint probability of (X < x and X > y) divided by P(X > y).
XlXXGy <- with(rnum, sum(X < x & X > y)) / allnum
round(XlXXGy / XGy, digits = 4)
## [1] 0.4204

5 points. Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.

# Compare the joint probability P(X > x, Y > y) with the product of the
# marginals P(X > x) * P(Y > y); independence predicts that they agree up to
# sampling noise.
Xx <- nrow(subset(rnum, X > x)) / allnum            # P(X > x)
Yy <- nrow(subset(rnum, Y > y)) / allnum            # P(Y > y)
XxYy <- nrow(subset(rnum, X > x & Y > y)) / allnum  # P(X > x, Y > y)
prod <- Xx * Yy                                     # P(X > x) * P(Y > y)

# Use an absolute tolerance instead of comparing values rounded to two
# decimals: rounding can report "False" for two numbers that differ by a
# hair but sit on opposite sides of a rounding boundary.
eq <- if (abs(prod - XxYy) < 0.005) "True" else "False"
print(eq)
## [1] "True"

# A data frame (rather than cbind) keeps the probabilities numeric instead of
# coercing every cell to character alongside `eq`.
kable(
  data.frame(Xx, Yy, prod, eq, XxYy),
  col.names = c("P(X>x)", "P(Y>y)", "P(X>x)_P(Y>y)", "Equal", "P(X>x & Y>y)")
) %>%
  kable_styling("responsive", full_width = FALSE, position = "left")
P(X>x) P(Y>y) P(X>x)_P(Y>y) Equal P(X>x & Y>y)
0.5 0.75 0.375 True 0.3756

5 points. Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?

# 2x2 contingency table for the independence tests: rows split on X > x,
# columns split on Y > y, and every cell is a JOINT count, so each of the
# simulated pairs is counted exactly once.
# The earlier version filled the cells with marginal counts (and repeated
# nrow(lEx) in the fourth cell), which is not a valid cross-tabulation.
# NOTE: the rendered table and test output below were produced with the old
# table and must be regenerated by re-knitting the document.
x_above <- rnum$X > x
y_above <- rnum$Y > y
conTable <- matrix(
  c(
    sum(x_above & y_above), sum(!x_above & y_above),   # column "Y > y"
    sum(x_above & !y_above), sum(!x_above & !y_above)  # column "Y <= y"
  ),
  nrow = 2, ncol = 2,
  dimnames = list(c("X > x", "X <= x"), c("Y > y", "Y <= y"))
)

kable(conTable) %>%
  kable_styling("responsive", full_width = FALSE, position = "left")
X>x,Y>y X <= x, Y<=y
x 5000 5000
y 7500 5000

Chi-Squared Test

# Pearson chi-squared test of independence on the 2x2 table built above.
chisq.test(conTable)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  conTable
## X-squared = 224.6, df = 1, p-value < 2.2e-16

Fisher Exact Test

# Fisher's exact test on the same 2x2 table.
fisher.test(conTable)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  conTable
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.6319745 0.7032372
## sample estimates:
## odds ratio 
##  0.6666973

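Fisher’s exact test computes an exact p-value and is mostly used for 2×2 contingency tables with small counts, while the chi-square test is a large-sample approximation that can be used for contingency tables of various dimensions. With 10,000 observations, the chi-squared test is more appropriate here. In the output above, the p-values for both tests are below the significance level, so the null hypothesis of independence would be rejected, suggesting a relationship between X and Y. Note, however, that this conflicts with the product check above (P(X>x)P(Y>y) = 0.375 versus P(X>x & Y>y) = 0.3756), so the contingency table passed to the tests should be re-checked.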

Problem 2

You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following.

5 points. Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

# Load the Kaggle House Prices training and test sets from the GitHub mirror.
mctrain <- read.csv("https://raw.githubusercontent.com/Luz917/data605final/master/train.csv")

mctest <- read.csv("https://raw.githubusercontent.com/Luz917/data605final/master/test.csv")

Univariate Descriptive Statistics and Plots

# Number of rows and columns in the training data.
dim(mctrain)
## [1] 1460   81
# Column-by-column overview: name, type and first few values.
glimpse(mctrain)
## Observations: 1,460
## Variables: 81
## $ Id            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ MSSubClass    <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, ...
## $ MSZoning      <fct> RL, RL, RL, RL, RL, RL, RL, RL, RM, RL, RL, RL, RL, R...
## $ LotFrontage   <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 9...
## $ LotArea       <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, ...
## $ Street        <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave,...
## $ Alley         <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ LotShape      <fct> Reg, Reg, IR1, IR1, IR1, IR1, Reg, IR1, Reg, Reg, Reg...
## $ LandContour   <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl...
## $ Utilities     <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllPu...
## $ LotConfig     <fct> Inside, FR2, Inside, Corner, FR2, Inside, Inside, Cor...
## $ LandSlope     <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl...
## $ Neighborhood  <fct> CollgCr, Veenker, CollgCr, Crawfor, NoRidge, Mitchel,...
## $ Condition1    <fct> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, PosN, Arte...
## $ Condition2    <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm,...
## $ BldgType      <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam,...
## $ HouseStyle    <fct> 2Story, 1Story, 2Story, 2Story, 2Story, 1.5Fin, 1Stor...
## $ OverallQual   <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4,...
## $ OverallCond   <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5,...
## $ YearBuilt     <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931,...
## $ YearRemodAdd  <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950,...
## $ RoofStyle     <fct> Gable, Gable, Gable, Gable, Gable, Gable, Gable, Gabl...
## $ RoofMatl      <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg,...
## $ Exterior1st   <fct> VinylSd, MetalSd, VinylSd, Wd Sdng, VinylSd, VinylSd,...
## $ Exterior2nd   <fct> VinylSd, MetalSd, VinylSd, Wd Shng, VinylSd, VinylSd,...
## $ MasVnrType    <fct> BrkFace, None, BrkFace, None, BrkFace, None, Stone, S...
## $ MasVnrArea    <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 30...
## $ ExterQual     <fct> Gd, TA, Gd, TA, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, G...
## $ ExterCond     <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, T...
## $ Foundation    <fct> PConc, CBlock, PConc, BrkTil, PConc, Wood, PConc, CBl...
## $ BsmtQual      <fct> Gd, Gd, Gd, TA, Gd, Gd, Ex, Gd, TA, TA, TA, Ex, TA, G...
## $ BsmtCond      <fct> TA, TA, TA, Gd, TA, TA, TA, TA, TA, TA, TA, TA, TA, T...
## $ BsmtExposure  <fct> No, Gd, Mn, No, Av, No, Av, Mn, No, No, No, No, No, A...
## $ BsmtFinType1  <fct> GLQ, ALQ, GLQ, ALQ, GLQ, GLQ, GLQ, ALQ, Unf, GLQ, Rec...
## $ BsmtFinSF1    <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906,...
## $ BsmtFinType2  <fct> Unf, Unf, Unf, Unf, Unf, Unf, Unf, BLQ, Unf, Unf, Unf...
## $ BsmtFinSF2    <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ BsmtUnfSF     <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134,...
## $ TotalBsmtSF   <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991,...
## $ Heating       <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA,...
## $ HeatingQC     <fct> Ex, Ex, Ex, Gd, Ex, Ex, Ex, Ex, Gd, Ex, Ex, Ex, TA, E...
## $ CentralAir    <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,...
## $ Electrical    <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrk...
## $ X1stFlrSF     <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 107...
## $ X2ndFlrSF     <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142,...
## $ LowQualFinSF  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ GrLivArea     <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774,...
## $ BsmtFullBath  <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,...
## $ BsmtHalfBath  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FullBath      <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2,...
## $ HalfBath      <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ BedroomAbvGr  <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2,...
## $ KitchenAbvGr  <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2,...
## $ KitchenQual   <fct> Gd, TA, Gd, Gd, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, G...
## $ TotRmsAbvGrd  <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6...
## $ Functional    <fct> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Min1, Typ, Ty...
## $ Fireplaces    <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0,...
## $ FireplaceQu   <fct> NA, TA, TA, Gd, TA, NA, Gd, TA, TA, TA, NA, Gd, NA, G...
## $ GarageType    <fct> Attchd, Attchd, Attchd, Detchd, Attchd, Attchd, Attch...
## $ GarageYrBlt   <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931,...
## $ GarageFinish  <fct> RFn, RFn, RFn, Unf, RFn, Unf, RFn, RFn, Unf, RFn, Unf...
## $ GarageCars    <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2,...
## $ GarageArea    <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384...
## $ GarageQual    <fct> TA, TA, TA, TA, TA, TA, TA, TA, Fa, Gd, TA, TA, TA, T...
## $ GarageCond    <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, T...
## $ PavedDrive    <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,...
## $ WoodDeckSF    <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, ...
## $ OpenPorchSF   <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 2...
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, ...
## $ X3SsnPorch    <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ ScreenPorch   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, ...
## $ PoolArea      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ PoolQC        <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ Fence         <fct> NA, NA, NA, NA, NA, MnPrv, NA, NA, NA, NA, NA, NA, NA...
## $ MiscFeature   <fct> NA, NA, NA, NA, NA, Shed, NA, Shed, NA, NA, NA, NA, N...
## $ MiscVal       <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 7...
## $ MoSold        <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3,...
## $ YrSold        <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008,...
## $ SaleType      <fct> WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, New, WD, ...
## $ SaleCondition <fct> Normal, Normal, Normal, Abnorml, Normal, Normal, Norm...
## $ SalePrice     <int> 208500, 181500, 223500, 140000, 250000, 143000, 30700...
# Univariate descriptive statistics for every column of the training data.
summary(mctrain)
##        Id           MSSubClass       MSZoning     LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   C (all):  10   Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   FV     :  65   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   RH     :  16   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9   RL     :1151   Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   RM     : 218   3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                  Max.   :313.00  
##                                                  NA's   :259     
##     LotArea        Street      Alley      LotShape  LandContour  Utilities   
##  Min.   :  1300   Grvl:   6   Grvl:  50   IR1:484   Bnk:  63    AllPub:1459  
##  1st Qu.:  7554   Pave:1454   Pave:  41   IR2: 41   HLS:  50    NoSeWa:   1  
##  Median :  9478               NA's:1369   IR3: 10   Low:  36                 
##  Mean   : 10517                           Reg:925   Lvl:1311                 
##  3rd Qu.: 11602                                                              
##  Max.   :215245                                                              
##                                                                              
##    LotConfig    LandSlope   Neighborhood   Condition1     Condition2  
##  Corner : 263   Gtl:1382   NAmes  :225   Norm   :1260   Norm   :1445  
##  CulDSac:  94   Mod:  65   CollgCr:150   Feedr  :  81   Feedr  :   6  
##  FR2    :  47   Sev:  13   OldTown:113   Artery :  48   Artery :   2  
##  FR3    :   4              Edwards:100   RRAn   :  26   PosN   :   2  
##  Inside :1052              Somerst: 86   PosN   :  19   RRNn   :   2  
##                            Gilbert: 79   RRAe   :  11   PosA   :   1  
##                            (Other):707   (Other):  15   (Other):   2  
##    BldgType      HouseStyle   OverallQual      OverallCond      YearBuilt   
##  1Fam  :1220   1Story :726   Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  2fmCon:  31   2Story :445   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Duplex:  52   1.5Fin :154   Median : 6.000   Median :5.000   Median :1973  
##  Twnhs :  43   SLvl   : 65   Mean   : 6.099   Mean   :5.575   Mean   :1971  
##  TwnhsE: 114   SFoyer : 37   3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                1.5Unf : 14   Max.   :10.000   Max.   :9.000   Max.   :2010  
##                (Other): 19                                                  
##   YearRemodAdd    RoofStyle       RoofMatl     Exterior1st   Exterior2nd 
##  Min.   :1950   Flat   :  13   CompShg:1434   VinylSd:515   VinylSd:504  
##  1st Qu.:1967   Gable  :1141   Tar&Grv:  11   HdBoard:222   MetalSd:214  
##  Median :1994   Gambrel:  11   WdShngl:   6   MetalSd:220   HdBoard:207  
##  Mean   :1985   Hip    : 286   WdShake:   5   Wd Sdng:206   Wd Sdng:197  
##  3rd Qu.:2004   Mansard:   7   ClyTile:   1   Plywood:108   Plywood:142  
##  Max.   :2010   Shed   :   2   Membran:   1   CemntBd: 61   CmentBd: 60  
##                                (Other):   2   (Other):128   (Other):136  
##    MasVnrType    MasVnrArea     ExterQual ExterCond  Foundation  BsmtQual  
##  BrkCmn : 15   Min.   :   0.0   Ex: 52    Ex:   3   BrkTil:146   Ex  :121  
##  BrkFace:445   1st Qu.:   0.0   Fa: 14    Fa:  28   CBlock:634   Fa  : 35  
##  None   :864   Median :   0.0   Gd:488    Gd: 146   PConc :647   Gd  :618  
##  Stone  :128   Mean   : 103.7   TA:906    Po:   1   Slab  : 24   TA  :649  
##  NA's   :  8   3rd Qu.: 166.0             TA:1282   Stone :  6   NA's: 37  
##                Max.   :1600.0                       Wood  :  3             
##                NA's   :8                                                   
##  BsmtCond    BsmtExposure BsmtFinType1   BsmtFinSF1     BsmtFinType2
##  Fa  :  45   Av  :221     ALQ :220     Min.   :   0.0   ALQ :  19   
##  Gd  :  65   Gd  :134     BLQ :148     1st Qu.:   0.0   BLQ :  33   
##  Po  :   2   Mn  :114     GLQ :418     Median : 383.5   GLQ :  14   
##  TA  :1311   No  :953     LwQ : 74     Mean   : 443.6   LwQ :  46   
##  NA's:  37   NA's: 38     Rec :133     3rd Qu.: 712.2   Rec :  54   
##                           Unf :430     Max.   :5644.0   Unf :1256   
##                           NA's: 37                      NA's:  38   
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF      Heating     HeatingQC
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Floor:   1   Ex:741   
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   GasA :1428   Fa: 49   
##  Median :   0.00   Median : 477.5   Median : 991.5   GasW :  18   Gd:241   
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4   Grav :   7   Po:  1   
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2   OthW :   2   TA:428   
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0   Wall :   4            
##                                                                            
##  CentralAir Electrical     X1stFlrSF      X2ndFlrSF     LowQualFinSF    
##  N:  95     FuseA:  94   Min.   : 334   Min.   :   0   Min.   :  0.000  
##  Y:1365     FuseF:  27   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##             FuseP:   3   Median :1087   Median :   0   Median :  0.000  
##             Mix  :   1   Mean   :1163   Mean   : 347   Mean   :  5.845  
##             SBrkr:1334   3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##             NA's :   1   Max.   :4692   Max.   :2065   Max.   :572.000  
##                                                                         
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##                                                                   
##     HalfBath       BedroomAbvGr    KitchenAbvGr   KitchenQual  TotRmsAbvGrd   
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Ex:100      Min.   : 2.000  
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   Fa: 39      1st Qu.: 5.000  
##  Median :0.0000   Median :3.000   Median :1.000   Gd:586      Median : 6.000  
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047   TA:735      Mean   : 6.518  
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000               3rd Qu.: 7.000  
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000               Max.   :14.000  
##                                                                               
##  Functional    Fireplaces    FireplaceQu   GarageType   GarageYrBlt  
##  Maj1:  14   Min.   :0.000   Ex  : 24    2Types :  6   Min.   :1900  
##  Maj2:   5   1st Qu.:0.000   Fa  : 33    Attchd :870   1st Qu.:1961  
##  Min1:  31   Median :1.000   Gd  :380    Basment: 19   Median :1980  
##  Min2:  34   Mean   :0.613   Po  : 20    BuiltIn: 88   Mean   :1979  
##  Mod :  15   3rd Qu.:1.000   TA  :313    CarPort:  9   3rd Qu.:2002  
##  Sev :   1   Max.   :3.000   NA's:690    Detchd :387   Max.   :2010  
##  Typ :1360                               NA's   : 81   NA's   :81    
##  GarageFinish   GarageCars      GarageArea     GarageQual  GarageCond 
##  Fin :352     Min.   :0.000   Min.   :   0.0   Ex  :   3   Ex  :   2  
##  RFn :422     1st Qu.:1.000   1st Qu.: 334.5   Fa  :  48   Fa  :  35  
##  Unf :605     Median :2.000   Median : 480.0   Gd  :  14   Gd  :   9  
##  NA's: 81     Mean   :1.767   Mean   : 473.0   Po  :   3   Po  :   7  
##               3rd Qu.:2.000   3rd Qu.: 576.0   TA  :1311   TA  :1326  
##               Max.   :4.000   Max.   :1418.0   NA's:  81   NA's:  81  
##                                                                       
##  PavedDrive   WoodDeckSF      OpenPorchSF     EnclosedPorch      X3SsnPorch    
##  N:  90     Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  P:  30     1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Y:1340     Median :  0.00   Median : 25.00   Median :  0.00   Median :  0.00  
##             Mean   : 94.24   Mean   : 46.66   Mean   : 21.95   Mean   :  3.41  
##             3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00   3rd Qu.:  0.00  
##             Max.   :857.00   Max.   :547.00   Max.   :552.00   Max.   :508.00  
##                                                                                
##   ScreenPorch        PoolArea        PoolQC       Fence      MiscFeature
##  Min.   :  0.00   Min.   :  0.000   Ex  :   2   GdPrv:  59   Gar2:   2  
##  1st Qu.:  0.00   1st Qu.:  0.000   Fa  :   2   GdWo :  54   Othr:   2  
##  Median :  0.00   Median :  0.000   Gd  :   3   MnPrv: 157   Shed:  49  
##  Mean   : 15.06   Mean   :  2.759   NA's:1453   MnWw :  11   TenC:   1  
##  3rd Qu.:  0.00   3rd Qu.:  0.000               NA's :1179   NA's:1406  
##  Max.   :480.00   Max.   :738.000                                       
##                                                                         
##     MiscVal             MoSold           YrSold        SaleType   
##  Min.   :    0.00   Min.   : 1.000   Min.   :2006   WD     :1267  
##  1st Qu.:    0.00   1st Qu.: 5.000   1st Qu.:2007   New    : 122  
##  Median :    0.00   Median : 6.000   Median :2008   COD    :  43  
##  Mean   :   43.49   Mean   : 6.322   Mean   :2008   ConLD  :   9  
##  3rd Qu.:    0.00   3rd Qu.: 8.000   3rd Qu.:2009   ConLI  :   5  
##  Max.   :15500.00   Max.   :12.000   Max.   :2010   ConLw  :   5  
##                                                     (Other):   9  
##  SaleCondition    SalePrice     
##  Abnorml: 101   Min.   : 34900  
##  AdjLand:   4   1st Qu.:129975  
##  Alloca :  12   Median :163000  
##  Family :  20   Mean   :180921  
##  Normal :1198   3rd Qu.:214000  
##  Partial: 125   Max.   :755000  
## 
# Histograms of every numeric column in the training data, one facet each.
trianplot <- select_if(mctrain, is.numeric)  # keep the numeric columns only

# Reshape to long format (columns `key` and `value`) so ggplot can facet on
# the variable name. trianplot is already all-numeric, so it is reshaped
# directly.
trainplot1 <- gather(trianplot)

tp1 <- ggplot(data = trainplot1, mapping = aes(x = value)) +
  geom_histogram() +
  facet_wrap(~ key, scales = "free")  # free scales: variables differ in range

ggplotly(tp1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 348 rows containing non-finite values (stat_bin).
# Pairwise plot matrix for columns 2 through 10 of the training data.
ggpairs(mctrain [,2:10], pch=10)

# Pairwise plot matrix for columns 6 through 11, coloured by the Street column.
ggpairs(mctrain, columns = c(6:11), mapping=ggplot2::aes(colour = Street),pch=20)

scatterplot matrix for at least two of the independent variables and the dependent variable

  • Dependent Variable - SalePrice
  • Independent Variables - GrLivArea, BsmtUnfSF, MSSubClass
# Scatterplot matrix (with histograms and Pearson correlations) for the
# dependent variable SalePrice and three predictors.
pairs.panels(
  mctrain[, c("SalePrice", "GrLivArea", "BsmtUnfSF", "MSSubClass")],
  method = "pearson",
  hist.col = "lightblue"
)

#### Looking at the histograms on the diagonal, one can tell that these variables are skewed to the right and therefore not normally distributed.

Basement Unfinished vs. Sale Price

# Scatterplot of unfinished basement area against sale price.
# Columns are referenced by bare name inside aes() so they are resolved in the
# `data` argument; ggplot2 discourages mctrain$... inside aes(). The plot
# title typo ("Basemeent Unfished") is also corrected.
tbs <- ggplot(mctrain, aes(x = BsmtUnfSF, y = SalePrice)) +
  geom_point() +
  xlab("Basement Unfinished") +
  ylab("Sale Price") +
  ggtitle("Basement Unfinished vs. Sale Price")

ggplotly(tbs)

Ground Living and Sale Price

# Scatterplot of above-ground living area against sale price.
# Columns are referenced by bare name inside aes() so they are resolved in the
# `data` argument; ggplot2 discourages mctrain$... inside aes().
tgs <- ggplot(mctrain, aes(x = GrLivArea, y = SalePrice)) +
  geom_point() +
  xlab("Ground Living Area") +
  ylab("Sale Price") +
  ggtitle("Ground Living Area vs. Sale Price")

ggplotly(tgs)

SubClass vs. Sale Price

# Scatterplot of the MSSubClass code against sale price.
# Columns are referenced by bare name inside aes() so they are resolved in the
# `data` argument; ggplot2 discourages mctrain$... inside aes().
tms <- ggplot(mctrain, aes(x = MSSubClass, y = SalePrice)) +
  geom_point() +
  xlab("SubClass") +
  ylab("Sale Price") +
  ggtitle("SubClass vs. Sale Price")

ggplotly(tms)

Derive a correlation matrix for any three quantitative variables in the dataset.

Variables used SalePrice, GrLivArea, BsmtUnfSF, MSSubClass

# Pearson correlation matrix of SalePrice and the three chosen predictors.
mctraincorr <- cor(
  mctrain[, c("SalePrice", "GrLivArea", "BsmtUnfSF", "MSSubClass")]
)

mctraincorr
##              SalePrice  GrLivArea  BsmtUnfSF  MSSubClass
## SalePrice   1.00000000 0.70862448  0.2144791 -0.08428414
## GrLivArea   0.70862448 1.00000000  0.2402573  0.07485318
## BsmtUnfSF   0.21447911 0.24025727  1.0000000 -0.14075948
## MSSubClass -0.08428414 0.07485318 -0.1407595  1.00000000
# Visualise the correlation matrix: coloured tiles in the lower triangle,
# numeric coefficients in the upper triangle.
corrplot.mixed(
  mctraincorr,
  lower = "color",
  upper = "number",
  lower.col = brewer.pal(n = 4, name = "YlGnBu"),
  upper.col = brewer.pal(n = 4, name = "RdYlBu")
)

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

# Test H0: correlation between GrLivArea and SalePrice is 0 (80% confidence interval).
cor.test(mctrain$GrLivArea, mctrain$SalePrice, method = "pearson", conf.level = .80)
## 
##  Pearson's product-moment correlation
## 
## data:  mctrain$GrLivArea and mctrain$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6915087 0.7249450
## sample estimates:
##       cor 
## 0.7086245
# Test H0: correlation between BsmtUnfSF and SalePrice is 0 (80% confidence interval).
cor.test(mctrain$BsmtUnfSF, mctrain$SalePrice, method = "pearson", conf.level = .80)
## 
##  Pearson's product-moment correlation
## 
## data:  mctrain$BsmtUnfSF and mctrain$SalePrice
## t = 8.3847, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.1822292 0.2462680
## sample estimates:
##       cor 
## 0.2144791
# Test H0: correlation between MSSubClass and SalePrice is 0 (80% confidence interval).
cor.test(mctrain$MSSubClass, mctrain$SalePrice, method = "pearson", conf.level = .80)
## 
##  Pearson's product-moment correlation
## 
## data:  mctrain$MSSubClass and mctrain$SalePrice
## t = -3.2298, df = 1458, p-value = 0.001266
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  -0.11751336 -0.05086638
## sample estimates:
##         cor 
## -0.08428414

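Looking at these correlations, all of the p-values are far below the significance level, so the null hypothesis of zero correlation is rejected for each pair. With three tests at the 80% confidence level, the familywise error rate could be as high as 1 − 0.8³ ≈ 0.49; however, the p-values are so small that they would remain significant even after a Bonferroni correction, so there is little reason to worry about familywise error here.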

5 points. Linear Algebra and Correlation. Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

correlation matrix
# Print the correlation matrix computed earlier.
mctraincorr
##              SalePrice  GrLivArea  BsmtUnfSF  MSSubClass
## SalePrice   1.00000000 0.70862448  0.2144791 -0.08428414
## GrLivArea   0.70862448 1.00000000  0.2402573  0.07485318
## BsmtUnfSF   0.21447911 0.24025727  1.0000000 -0.14075948
## MSSubClass -0.08428414 0.07485318 -0.1407595  1.00000000
Invert your correlation matrix
# Invert the correlation matrix; the inverse is the precision matrix.
solve(mctraincorr)
##              SalePrice  GrLivArea   BsmtUnfSF MSSubClass
## SalePrice   2.09054636 -1.4901976 -0.05085229  0.2805880
## GrLivArea  -1.49019759  2.1372579 -0.23880517 -0.3191947
## BsmtUnfSF  -0.05085229 -0.2388052  1.09182687  0.1672743
## MSSubClass  0.28058798 -0.3191947  0.16727427  1.0710873
Multiply the correlation matrix by the precision matrix
# Correlation matrix times its inverse: expected to be the identity matrix,
# up to floating-point error in the off-diagonal entries.
mctraincorr %*% solve(mctraincorr)
##                SalePrice    GrLivArea    BsmtUnfSF   MSSubClass
## SalePrice   1.000000e+00 6.938894e-18 5.204170e-18 1.387779e-17
## GrLivArea   1.457168e-16 1.000000e+00 6.938894e-18 1.387779e-17
## BsmtUnfSF   1.387779e-17 6.245005e-17 1.000000e+00 0.000000e+00
## MSSubClass -5.551115e-17 0.000000e+00 0.000000e+00 1.000000e+00
Multiply the precision matrix by the correlation matrix
# Inverse times the correlation matrix: again expected to be the identity
# matrix, up to floating-point error.
solve(mctraincorr) %*% mctraincorr
##                SalePrice     GrLivArea     BsmtUnfSF    MSSubClass
## SalePrice   1.000000e+00 -7.285839e-17 -5.551115e-17  0.000000e+00
## GrLivArea   2.220446e-16  1.000000e+00  9.020562e-17  5.551115e-17
## BsmtUnfSF   6.938894e-18  3.469447e-18  1.000000e+00 -2.775558e-17
## MSSubClass -2.775558e-17  0.000000e+00  2.775558e-17  1.000000e+00
Conduct LU decomposition on the matrix
# LU-decompose the correlation matrix, then multiply the lower (L) and upper
# (U) factors back together to confirm that they reproduce the original.
luMat <- lu.decomposition(mctraincorr)

with(luMat, L %*% U)
##             [,1]       [,2]       [,3]        [,4]
## [1,]  1.00000000 0.70862448  0.2144791 -0.08428414
## [2,]  0.70862448 1.00000000  0.2402573  0.07485318
## [3,]  0.21447911 0.24025727  1.0000000 -0.14075948
## [4,] -0.08428414 0.07485318 -0.1407595  1.00000000

5 points. Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

# Histogram of the variable chosen for the exponential fit (BsmtUnfSF).
hist(mctrain$BsmtUnfSF, breaks = 30, main = "BsmtUnfSF", col = "blue")

# Fit an exponential distribution to BsmtUnfSF by maximum likelihood and keep
# the estimated rate parameter.
fitdes <- fitdistr(mctrain$BsmtUnfSF, densfun = "exponential")
rate <- fitdes[["estimate"]]
rate
##        rate 
## 0.001762921
# Draw 1000 samples from the fitted exponential distribution and compare their
# histogram side by side with the histogram of the original variable.
exponen <- rexp(1000, rate)

org <- mctrain$BsmtUnfSF

# par() returns the previous settings; keep them so the two-panel layout does
# not leak into later base-graphics plots.
old_par <- par(mfrow = c(1, 2))
hist(exponen, breaks = 30, xlim = c(0, 4000), main = "Exponential - BsmtUnfSF",
    col = "purple")

hist(mctrain$BsmtUnfSF, breaks = 30, main = "Original - BsmtUnfSF", col = "red")
par(old_par)

# 5th and 95th percentiles of the FITTED exponential distribution, obtained by
# inverting its CDF: qexp(p, rate) = -log(1 - p) / rate.
# The earlier version took quantiles of the 1000 simulated draws, which is a
# noisy sample estimate rather than the percentiles of the fitted model.
# NOTE: the rendered output below was produced by the sample-based version and
# must be regenerated by re-knitting the document.
qexp(c(0.05, 0.95), rate = rate)
##         5%        95% 
##   32.85567 1597.22999
# 95% confidence interval for the mean of the original variable.
ci(org, confidence = .95)
##   Estimate   CI lower   CI upper Std. Error 
##  567.24041  544.55620  589.92462   11.56419
# Empirical 5th and 95th percentiles of the original variable.
quantile(org, c(.05, .95))
##   5%  95% 
##    0 1468

10 points. Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

First we prepare the data by joining the training and test sets.

# Stack train and test so both receive identical factor levels and imputation.
mctraintest <- bind_rows(mctrain, mctest)

# Split columns by kind. Categorical columns may arrive as character OR factor
# (depending on how read.csv was configured), and numeric columns may be
# integer OR double; testing only is.character / is.integer would silently
# drop the rest from the modelling data.
is_categorical <- vapply(
  mctraintest,
  function(column) is.character(column) || is.factor(column),
  logical(1)
)
is_number <- vapply(mctraintest, is.numeric, logical(1))

# Categorical variables: a missing value becomes its own level.
charvar <- mctraintest[, is_categorical, drop = FALSE]
charvar[] <- lapply(charvar, as.character)
charvar[is.na(charvar)] <- "Not Applicable"
factortt <- as.data.frame(lapply(charvar, as.factor))

# Numeric variables are kept as they are; their NAs are imputed below.
int <- mctraintest[, is_number, drop = FALSE]

# Recombine the factor and numeric columns.
mctraintest <- bind_cols(factortt, int)

# Impute the remaining (numeric) missing values with random-forest imputation.
mmod <- mice(mctraintest, method = "rf")
## 
##  iter imp variable
##   1   1  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   1   2  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   1   3  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   1   4  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   1   5  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   2   1  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   2   2  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   2   3  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   2   4  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   2   5  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   3   1  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   3   2  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   3   3  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   3   4  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   3   5  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   4   1  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   4   2  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   4   3  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   4   4  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   4   5  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   5   1  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   5   2  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   5   3  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   5   4  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice
##   5   5  LotFrontage  MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  BsmtFullBath  BsmtHalfBath  GarageYrBlt  GarageCars  GarageArea  SalePrice

Density of imputed values

# Density plot of the imputation object `mmod`.
densityplot(mmod)

Stripplot of imputed data

# Strip plot of the imputation object `mmod`.
stripplot(mmod, pch = 5, cex = 1.2)   

  • After imputing, the data set has to be completed with the imputed values; then we separate the train and test data again.
# Fill in the imputed values. complete() is qualified with mice:: because
# tidyr (also loaded) exports a function of the same name.
mctraintest <- mice::complete(mmod)

# The first nrow(mctrain) rows are the training rows (train was stacked first);
# everything after them is the test set. seq_len() replaces the 1:length()
# idiom, which misbehaves for a length of zero.
mctrain1 <- mctraintest[seq_len(nrow(mctrain)), ]
mctest1 <- mctraintest[-seq_len(nrow(mctrain)), ]

Now we create the models. Two different models will be created: first a support vector machine (SVM) model from the e1071 package, then a random forest model from the randomForest package.

# Fit both models on every predictor except Id. Id is a row identifier, not a
# property of the house, and `SalePrice ~ .` would otherwise use it as a
# feature.
# NOTE: prediction summaries and scores reported below were produced with Id
# included and must be regenerated by re-running the document.
model_data <- mctrain1[, setdiff(names(mctrain1), "Id"), drop = FALSE]

# Support vector machine regression (e1071).
svmmod <- svm(SalePrice ~ ., data = model_data, cost = 3)
svmpred <- predict(svmmod, newdata = mctest1)

# Random forest regression (randomForest).
rfmod <- randomForest(SalePrice ~ ., data = model_data)
rfpred <- predict(rfmod, newdata = mctest1)

# create submission file
# Build the two Kaggle submission files (columns: Id, SalePrice).
# The data frames are constructed directly instead of via cbind(): Id keeps
# its original type, and a length mismatch between the Ids and the predictions
# raises an error instead of being recycled into a wrong file.
mcsubmission1 <- data.frame(Id = mctest$Id, SalePrice = as.numeric(svmpred))
mcsubmission2 <- data.frame(Id = mctest$Id, SalePrice = as.numeric(rfpred))

write.csv(mcsubmission1, file = "MC Submission 1.csv", quote = FALSE, row.names = FALSE)
write.csv(mcsubmission2, file = "MC Submission 2.csv", quote = FALSE, row.names = FALSE)

# Distribution of the SVM predictions for the test set.
summary(svmpred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   53747  134173  164306  180179  211928  486739
# Distribution of the random forest predictions for the test set.
summary(rfpred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   66489  131007  158944  179331  209911  474528

Out of the 2 submissions, the random forest model received the better score.

Username for Kaggle: maryluzcruz

# Embed the image file 'Untitled.png' in the rendered document.
knitr::include_graphics('Untitled.png')