Final Project

Loading required libraries

library(ggplot2)
library(MASS)
library(caret)
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(matrixcalc)

Problem 1

Pick one of the quantitative independent variables (Xi) from the data set below, and define that variable as X. Also, pick one of the dependent variables (Yi) below, and define that as Y.

# Load the Problem 1 data set (20 observations of Y1-Y4 and X1-X4).
# The mangled first column name ("ï..Y1") seen without an explicit encoding
# indicates the file starts with a UTF-8 byte-order mark; reading it as
# "UTF-8-BOM" strips the mark so the column is simply named Y1.
data <- read.csv(
  "https://raw.githubusercontent.com/VioletaStoyanova/Data605/master/Problem1%20.csv",
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8-BOM"
)
data
##      Y1   Y2   Y3   Y4   X1   X2   X3   X4
## 1  20.3 20.8 28.4 20.2  9.3  7.4  9.5  9.3
## 2  19.1 14.6 21.5 18.6  4.1  6.4  3.7 12.4
## 3  19.3 18.0 20.8 22.6 22.4  8.5 11.7 19.9
## 4  20.9  7.3 22.2 11.4  9.1  9.5  7.4  6.9
## 5  22.0 19.4 21.6 23.6 15.8 11.8  5.3 -1.0
## 6  23.5 13.5 21.8 24.0  7.1  8.8  7.4 10.6
## 7  13.8 14.7 25.2 26.0 15.9  8.4  7.4  6.4
## 8  18.8 15.3 22.5 26.8  6.9  5.1  8.6 10.6
## 9  20.9 12.6 21.1 19.7 16.0 11.4  9.1  1.2
## 10 18.6 13.0 21.7 22.7  6.7 15.1 11.4  7.7
## 11 22.3 13.1 21.4 16.8  8.2 12.6  8.4 15.5
## 12 17.6 10.3 20.8 20.2 16.0  8.0  7.3  6.9
## 13 20.8 14.9 23.0 21.7  6.4 10.3 11.3 13.7
## 14 28.7 14.8 17.4 20.9 11.8 10.4  4.4  3.7
## 15 15.2 16.2 21.3 26.9  3.5  9.5  9.3  4.4
## 16 20.9 15.7 15.1 16.3 21.7  9.5 10.9 11.5
## 17 18.4 16.3 17.8 19.9 12.2 15.1 10.9  4.2
## 18 10.3 11.5 26.4 15.5  9.3  6.6  7.7 13.9
## 19 26.3 12.2 21.6 26.5  8.0 15.4  7.7 12.9
## 20 28.1 11.8 22.5 21.7  6.2  8.2 11.5  1.2

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 3rd quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

# Min, quartiles, median and mean of the chosen independent variable X2
summary(data$X2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.10    8.15    9.50    9.90   11.50   15.40
# Same summary for the chosen dependent variable Y2
summary(data$Y2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    7.30   12.50   14.65   14.30   15.82   20.80
# X2 plays the role of X for the rest of Problem 1
# Y2 plays the role of Y for the rest of Problem 1
# Distribution of each variable; histogram() is presumably lattice's, which
# the library log shows being attached as a dependency when caret is loaded.
histogram(data$X2)

histogram(data$Y2)

a. P(X>x | Y>y) b. P(X>x, Y>y) c. P(X<x | Y>y)

Define the 3rd quartile for X2

# x: the 75th percentile (3rd quartile) of X2
xQ3 <- quantile(data$X2, probs = 0.75)
xQ3
##  75% 
## 11.5

Define the 1st quartile for the Y2 variable

# y: the 25th percentile (1st quartile) of Y2
yQ1 <- quantile(data$Y2, probs = 0.25)
yQ1
##  25% 
## 12.5

Formula for Conditional Probability

P(X | Y) = P(X, Y) / P(Y)

a. P(X>x | Y>y)

# P(X > x | Y > y) = P(X > x, Y > y) / P(Y > y); each probability is the
# share of rows of `data` that satisfy the condition.
numerator <- data %>%
  filter(Y2 > yQ1, X2 > xQ3) %>%
  tally() / nrow(data)

denominator <- data %>%
  filter(Y2 > yQ1) %>%
  tally() / nrow(data)

a <- numerator / denominator
a
##           n
## 1 0.2666667
b. P(X>x, Y>y)
# Marginal probabilities P(X > x) and P(Y > y), kept for reference.
Xx <- filter(data, X2 > xQ3) %>% tally() / nrow(data)
Yy <- filter(data, Y2 > yQ1) %>% tally() / nrow(data)

# Joint probability P(X > x, Y > y): the share of rows where BOTH conditions
# hold. The product of the marginals (Xx * Yy) equals the joint probability
# only when X and Y are independent, so it must not be used as the estimate.
(b <- filter(data, X2 > xQ3 & Y2 > yQ1) %>% tally() / nrow(data))
##     n
## 1 0.2
c. P(X<x | Y>y)
# P(X < x | Y > y): among rows with Y2 above its 1st quartile, the share whose
# X2 lies below its 3rd quartile.
numerator <- data %>%
  filter(X2 < xQ3, Y2 > yQ1) %>%
  tally() / nrow(data)

denominator <- data %>%
  filter(Y2 > yQ1) %>%
  tally() / nrow(data)

c <- numerator / denominator
print(c)
##           n
## 1 0.7333333
# Contingency counts for X2 split at its 3rd quartile (xQ3) and Y2 split at
# its 1st quartile (yQ1): c1, c2, c4, c5 are the four cells; c3 and c6 are
# row totals, c7 and c8 column totals, c9 the grand total.
x_low <- data$X2 <= xQ3
y_low <- data$Y2 <= yQ1

c1 <- sum(x_low & y_low)
c2 <- sum(x_low & !y_low)
c4 <- sum(!x_low & y_low)
c5 <- sum(!x_low & !y_low)
c3 <- c1 + c2
c6 <- c4 + c5
c7 <- c1 + c4
c8 <- c2 + c5
c9 <- c3 + c6

# Lay the counts out row by row and label the margins.
dfcont <- matrix(
  round(c(c1, c2, c3, c4, c5, c6, c7, c8, c9), 3),
  nrow = 3,
  ncol = 3,
  byrow = TRUE,
  dimnames = list(
    c("X<=x", "X>x", "Total"),
    c("Y<=y", "Y>y", "Total")
  )
)

dfcont <- knitr::kable(as.table(dfcont))
dfcont
Y<=y Y>y Total
X<=x 4 11 15
X>x 1 4 5
Total 5 15 20
# 2x2 table of cell counts (margins excluded) for the chi-square test, taken
# from the counts computed for the contingency table rather than retyped by
# hand, so the test always matches the data.
mat <- matrix(c(c1, c2, c4, c5), nrow = 2, ncol = 2, byrow = TRUE)

# Pearson chi-square test of association with Yates' continuity correction.
chisq.test(mat, correct = TRUE)
## Warning in chisq.test(mat, correct = TRUE): Chi-squared approximation may
## be incorrect
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mat
## X-squared = 0, df = 1, p-value = 1

Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 1st quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does P(AB)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.

# Threshold for A: the 25th percentile (1st quartile) of X2
xQ1 <- quantile(data$X2, probs = 0.25)
xQ1
##  25% 
## 8.15
# Contingency counts for X2 split at xQ1 and Y2 split at yQ1: c1, c2, c4, c5
# are the four cells; c3 and c6 are row totals, c7 and c8 column totals, c9
# the grand total.
x_below <- data$X2 <= xQ1
y_below <- data$Y2 <= yQ1

c1 <- sum(x_below & y_below)
c2 <- sum(x_below & !y_below)
c4 <- sum(!x_below & y_below)
c5 <- sum(!x_below & !y_below)
c3 <- c1 + c2
c6 <- c4 + c5
c7 <- c1 + c4
c8 <- c2 + c5
c9 <- c3 + c6

# Lay the counts out row by row and label the margins.
dfcont <- matrix(
  round(c(c1, c2, c3, c4, c5, c6, c7, c8, c9), 3),
  nrow = 3,
  ncol = 3,
  byrow = TRUE,
  dimnames = list(
    c("X<=x", "X>x", "Total"),
    c("Y<=y", "Y>y", "Total")
  )
)

dfcont <- knitr::kable(as.table(dfcont))
dfcont
Y<=y Y>y Total
X<=x 2 3 5
X>x 3 12 15
Total 5 15 20
# 2x2 table of cell counts (margins excluded) for the chi-square test, taken
# from the counts computed for the contingency table rather than retyped by
# hand, so the test always matches the data.
mat <- matrix(c(c1, c2, c4, c5), nrow = 2, ncol = 2, byrow = TRUE)

# Pearson chi-square test of association with Yates' continuity correction.
chisq.test(mat, correct = TRUE)
## Warning in chisq.test(mat, correct = TRUE): Chi-squared approximation may
## be incorrect
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mat
## X-squared = 0.088889, df = 1, p-value = 0.7656

From our Chi-square test we can see that the p-value (0.7656) is larger than 0.05, so we fail to reject the null hypothesis H0 of independence; in other words, the data provide no evidence of an association between the two variables.

Problem 2

You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following. 5 points. Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

# Kaggle House Prices training set, read with character columns kept as
# strings; show the first six rows.
train_url <- "https://raw.githubusercontent.com/VioletaStoyanova/Data605/master/train.csv"
train <- read.csv(train_url, stringsAsFactors = FALSE)
head(train, n = 6)
##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1  1         60       RL          65    8450   Pave  <NA>      Reg
## 2  2         20       RL          80    9600   Pave  <NA>      Reg
## 3  3         60       RL          68   11250   Pave  <NA>      IR1
## 4  4         70       RL          60    9550   Pave  <NA>      IR1
## 5  5         60       RL          84   14260   Pave  <NA>      IR1
## 6  6         50       RL          85   14115   Pave  <NA>      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 2         Lvl    AllPub       FR2       Gtl      Veenker      Feedr
## 3         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 4         Lvl    AllPub    Corner       Gtl      Crawfor       Norm
## 5         Lvl    AllPub       FR2       Gtl      NoRidge       Norm
## 6         Lvl    AllPub    Inside       Gtl      Mitchel       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     2Story           7           5      2003
## 2       Norm     1Fam     1Story           6           8      1976
## 3       Norm     1Fam     2Story           7           5      2001
## 4       Norm     1Fam     2Story           7           5      1915
## 5       Norm     1Fam     2Story           8           5      2000
## 6       Norm     1Fam     1.5Fin           5           5      1993
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         2003     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 2         1976     Gable  CompShg     MetalSd     MetalSd       None
## 3         2002     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 4         1970     Gable  CompShg     Wd Sdng     Wd Shng       None
## 5         2000     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 6         1995     Gable  CompShg     VinylSd     VinylSd       None
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1        196        Gd        TA      PConc       Gd       TA           No
## 2          0        TA        TA     CBlock       Gd       TA           Gd
## 3        162        Gd        TA      PConc       Gd       TA           Mn
## 4          0        TA        TA     BrkTil       TA       Gd           No
## 5        350        Gd        TA      PConc       Gd       TA           Av
## 6          0        TA        TA       Wood       Gd       TA           No
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          GLQ        706          Unf          0       150         856
## 2          ALQ        978          Unf          0       284        1262
## 3          GLQ        486          Unf          0       434         920
## 4          ALQ        216          Unf          0       540         756
## 5          GLQ        655          Unf          0       490        1145
## 6          GLQ        732          Unf          0        64         796
##   Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1    GasA        Ex          Y      SBrkr       856       854            0
## 2    GasA        Ex          Y      SBrkr      1262         0            0
## 3    GasA        Ex          Y      SBrkr       920       866            0
## 4    GasA        Gd          Y      SBrkr       961       756            0
## 5    GasA        Ex          Y      SBrkr      1145      1053            0
## 6    GasA        Ex          Y      SBrkr       796       566            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1      1710            1            0        2        1            3
## 2      1262            0            1        2        0            3
## 3      1786            1            0        2        1            3
## 4      1717            1            0        1        0            3
## 5      2198            1            0        2        1            4
## 6      1362            1            0        1        1            1
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          Gd            8        Typ          0        <NA>
## 2            1          TA            6        Typ          1          TA
## 3            1          Gd            6        Typ          1          TA
## 4            1          Gd            7        Typ          1          Gd
## 5            1          Gd            9        Typ          1          TA
## 6            1          TA            5        Typ          0        <NA>
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Attchd        2003          RFn          2        548         TA
## 2     Attchd        1976          RFn          2        460         TA
## 3     Attchd        2001          RFn          2        608         TA
## 4     Detchd        1998          Unf          3        642         TA
## 5     Attchd        2000          RFn          3        836         TA
## 6     Attchd        1993          Unf          2        480         TA
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1         TA          Y          0          61             0          0
## 2         TA          Y        298           0             0          0
## 3         TA          Y          0          42             0          0
## 4         TA          Y          0          35           272          0
## 5         TA          Y        192          84             0          0
## 6         TA          Y         40          30             0        320
##   ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1           0        0   <NA>  <NA>        <NA>       0      2   2008
## 2           0        0   <NA>  <NA>        <NA>       0      5   2007
## 3           0        0   <NA>  <NA>        <NA>       0      9   2008
## 4           0        0   <NA>  <NA>        <NA>       0      2   2006
## 5           0        0   <NA>  <NA>        <NA>       0     12   2008
## 6           0        0   <NA> MnPrv        Shed     700     10   2009
##   SaleType SaleCondition SalePrice
## 1       WD        Normal    208500
## 2       WD        Normal    181500
## 3       WD        Normal    223500
## 4       WD       Abnorml    140000
## 5       WD        Normal    250000
## 6       WD        Normal    143000
# Univariate descriptive statistics for every column of the training data
summary(object = train)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig        
##  Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   LandSlope         Neighborhood        Condition1       
##  Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   Condition2          BldgType          HouseStyle         OverallQual    
##  Length:1460        Length:1460        Length:1460        Min.   : 1.000  
##  Class :character   Class :character   Class :character   1st Qu.: 5.000  
##  Mode  :character   Mode  :character   Mode  :character   Median : 6.000  
##                                                           Mean   : 6.099  
##                                                           3rd Qu.: 7.000  
##                                                           Max.   :10.000  
##                                                                           
##   OverallCond      YearBuilt     YearRemodAdd   RoofStyle        
##  Min.   :1.000   Min.   :1872   Min.   :1950   Length:1460       
##  1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967   Class :character  
##  Median :5.000   Median :1973   Median :1994   Mode  :character  
##  Mean   :5.575   Mean   :1971   Mean   :1985                     
##  3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004                     
##  Max.   :9.000   Max.   :2010   Max.   :2010                     
##                                                                  
##    RoofMatl         Exterior1st        Exterior2nd       
##  Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   MasVnrType          MasVnrArea      ExterQual          ExterCond        
##  Length:1460        Min.   :   0.0   Length:1460        Length:1460       
##  Class :character   1st Qu.:   0.0   Class :character   Class :character  
##  Mode  :character   Median :   0.0   Mode  :character   Mode  :character  
##                     Mean   : 103.7                                        
##                     3rd Qu.: 166.0                                        
##                     Max.   :1600.0                                        
##                     NA's   :8                                             
##   Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical          X1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##    X2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      X3SsnPorch      ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature       
##  Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##     MiscVal             MoSold           YrSold       SaleType        
##  Min.   :    0.00   Min.   : 1.000   Min.   :2006   Length:1460       
##  1st Qu.:    0.00   1st Qu.: 5.000   1st Qu.:2007   Class :character  
##  Median :    0.00   Median : 6.000   Median :2008   Mode  :character  
##  Mean   :   43.49   Mean   : 6.322   Mean   :2008                     
##  3rd Qu.:    0.00   3rd Qu.: 8.000   3rd Qu.:2009                     
##  Max.   :15500.00   Max.   :12.000   Max.   :2010                     
##                                                                       
##  SaleCondition        SalePrice     
##  Length:1460        Min.   : 34900  
##  Class :character   1st Qu.:129975  
##  Mode  :character   Median :163000  
##                     Mean   :180921  
##                     3rd Qu.:214000  
##                     Max.   :755000  
## 
# Distribution plots for BedroomAbvGr on a 2x2 panel:
# histogram, boxplot and normal Q-Q plot with reference line.
par(mfrow = c(2, 2))
hist(train$BedroomAbvGr, col = "red")
# The title names the variable being plotted.
boxplot(train$BedroomAbvGr, main = "Boxplot BedroomAbvGr")
qqnorm(train$BedroomAbvGr)
qqline(train$BedroomAbvGr)

From the plots one can see that the data is skewed to the right

# Distribution plots for GarageArea on a 2x2 panel:
# histogram, boxplot and normal Q-Q plot with reference line.
par(mfrow = c(2, 2))
hist(train$GarageArea, col = "blue")
# Each boxplot title names the variable being plotted.
boxplot(train$GarageArea, main = "Boxplot GarageArea")
qqnorm(train$GarageArea)
qqline(train$GarageArea)


# Reset the layout so the SalePrice plots start on a fresh 2x2 panel.
par(mfrow = c(2, 2))

hist(train$SalePrice, col = "green")

boxplot(train$SalePrice, main = "Boxplot SalePrice")

qqnorm(train$SalePrice)
qqline(train$SalePrice)

# SalePrice against each predictor, with the simple least-squares line
# overlaid in red.
bedroom_line <- lm(train$SalePrice ~ train$BedroomAbvGr)
plot(train$BedroomAbvGr, train$SalePrice, main = "Scatterplot SalePrice by BedroomAbvGr ")
abline(bedroom_line, lwd = 3, col = "red")

garage_line <- lm(train$SalePrice ~ train$GarageArea)
plot(train$GarageArea, train$SalePrice, main = "Scatterplot SalePrice by GarageArea ")
abline(garage_line, lwd = 3, col = "red")

Creating a multiple linear regression for the 2 independent variables

# Ordinary least-squares fit of SalePrice on the two chosen predictors,
# followed by the coefficient table and fit statistics.
fit <- lm(
  formula = SalePrice ~ BedroomAbvGr + GarageArea,
  data = train
)
summary(object = fit)
## 
## Call:
## lm(formula = SalePrice ~ BedroomAbvGr + GarageArea, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -266238  -33001   -4397   23587  477890 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37074.363   6676.300   5.553 3.33e-08 ***
## BedroomAbvGr 12472.557   1972.131   6.324 3.37e-10 ***
## GarageArea     228.540      7.525  30.372  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 61320 on 1457 degrees of freedom
## Multiple R-squared:  0.405,  Adjusted R-squared:  0.4042 
## F-statistic: 495.9 on 2 and 1457 DF,  p-value: < 2.2e-16
# Diagnostic plots for the fitted linear model
plot(x = fit)

# Test H0: correlation = 0 between BedroomAbvGr and SalePrice (Pearson),
# reporting an 80% confidence interval.
cor.test(
  x = train$BedroomAbvGr,
  y = train$SalePrice,
  conf.level = 0.80,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  train$BedroomAbvGr and train$SalePrice
## t = 6.5159, df = 1458, p-value = 9.927e-11
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.1354160 0.2006421
## sample estimates:
##       cor 
## 0.1682132
# Test H0: correlation = 0 between GarageArea and SalePrice (Pearson),
# reporting an 80% confidence interval.
cor.test(
  x = train$GarageArea,
  y = train$SalePrice,
  conf.level = 0.80,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  train$GarageArea and train$SalePrice
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6024756 0.6435283
## sample estimates:
##       cor 
## 0.6234314
# Test H0: correlation = 0 between GarageCars and SalePrice (Pearson),
# reporting an 80% confidence interval.
cor.test(
  x = train$GarageCars,
  y = train$SalePrice,
  conf.level = 0.80,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  train$GarageCars and train$SalePrice
## t = 31.839, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6201771 0.6597899
## sample estimates:
##       cor 
## 0.6404092

One can observe that there is a relatively high positive correlation between GarageCars and SalePrice and between GarageArea and SalePrice, but only a weak correlation between BedroomAbvGr and SalePrice.

Linear Algebra and Correlation. Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

# Pairwise Pearson correlations among the three chosen predictors
cor_vars <- c("BedroomAbvGr", "GarageArea", "GarageCars")
cor_data <- cor(train[, cor_vars])
print(cor_data)
##              BedroomAbvGr GarageArea GarageCars
## BedroomAbvGr   1.00000000 0.06525253 0.08610644
## GarageArea     0.06525253 1.00000000 0.88247541
## GarageCars     0.08610644 0.88247541 1.00000000
dim(cor_data)
## [1] 3 3
# Precision matrix: the inverse of the correlation matrix
pre_data <- solve(a = cor_data)
print(pre_data)
##              BedroomAbvGr  GarageArea GarageCars
## BedroomAbvGr   1.00799861  0.04890745 -0.1299548
## GarageArea     0.04890745  4.52240963 -3.9951266
## GarageCars    -0.12995479 -3.99512656  4.5367909
# Correlation matrix times precision matrix
cor_data %*% pre_data
##              BedroomAbvGr    GarageArea GarageCars
## BedroomAbvGr            1  0.000000e+00          0
## GarageArea              0  1.000000e+00          0
## GarageCars              0 -8.881784e-16          1
# Precision matrix times correlation matrix
pre_data %*% cor_data
##               BedroomAbvGr   GarageArea    GarageCars
## BedroomAbvGr  1.000000e+00 2.775558e-17  2.775558e-17
## GarageArea   -5.551115e-17 1.000000e+00 -1.332268e-15
## GarageCars    5.551115e-17 0.000000e+00  1.000000e+00
# LU decomposition of the precision matrix
lu.decomposition(pre_data)
## $L
##             [,1]       [,2] [,3]
## [1,]  1.00000000  0.0000000    0
## [2,]  0.04851936  1.0000000    0
## [3,] -0.12892358 -0.8824754    1
## 
## $U
##          [,1]       [,2]       [,3]
## [1,] 1.007999 0.04890745 -0.1299548
## [2,] 0.000000 4.52003667 -3.9888212
## [3,] 0.000000 0.00000000  1.0000000

Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

# Shift the chosen variable by a tiny positive constant and confirm that its
# minimum is above zero.
BedroomAbvGr <- train$BedroomAbvGr + 1e-32
min(BedroomAbvGr)
## [1] 1e-32
# Fit an exponential distribution to the shifted variable with fitdistr()
fit <- fitdistr(BedroomAbvGr, densfun = "exponential")
fit
##       rate    
##   0.348864994 
##  (0.009130214)
# Estimated rate parameter of the fitted distribution
lambda <- fit$estimate
lambda
##     rate 
## 0.348865
# Draw 1000 values from the fitted exponential and compare the simulated and
# observed histograms side by side.
samp <- rexp(n = 1000, rate = lambda)
par(mfrow = c(1, 2))
hist(samp, main = "Simulated", xlab = "BedroomAbvGr")
hist(train$BedroomAbvGr, main = "Observed", xlab = "BedroomAbvGr")

The simulated data are heavily skewed to the right, whereas the observed data are concentrated near the center.

# Empirical CDF of the simulated sample. It is stored under its own name
# because the object returned by ecdf() is itself a function: assigning it to
# `ecdf` would mask stats::ecdf() for any later call.
(samp_ecdf <- ecdf(samp))
## Empirical CDF 
## Call: ecdf(samp)
##  x[1:1000] = 0.0035525, 0.0057715, 0.0058203,  ..., 20.378, 21.837
# Sample quantiles of the simulated draws in 5% steps (includes the 5th and
# 95th percentiles).
quantile(samp, probs = seq(0, 1, 0.05))
##           0%           5%          10%          15%          20% 
##  0.003552548  0.129693509  0.276155837  0.421016613  0.541419284 
##          25%          30%          35%          40%          45% 
##  0.722549328  0.878742289  1.076093290  1.299830970  1.606809266 
##          50%          55%          60%          65%          70% 
##  1.880711609  2.191986108  2.505532875  2.882929179  3.314130632 
##          75%          80%          85%          90%          95% 
##  3.747102636  4.318757937  5.227683710  6.503420020  8.386185133 
##         100% 
## 21.837246380

Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

I will build a multiple linear regression using the variables whose correlation with the dependent variable SalePrice is higher than 0.5.

# Build a data frame holding only the numeric columns of the training data.
# vapply() guarantees one logical flag per column, and drop = FALSE keeps the
# result a data frame even if only one column qualifies.
quantVar <- vapply(train, is.numeric, logical(1))
quantVar_df <- train[, quantVar, drop = FALSE]
head(quantVar_df)
##   Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt
## 1  1         60          65    8450           7           5      2003
## 2  2         20          80    9600           6           8      1976
## 3  3         60          68   11250           7           5      2001
## 4  4         70          60    9550           7           5      1915
## 5  5         60          84   14260           8           5      2000
## 6  6         50          85   14115           5           5      1993
##   YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1         2003        196        706          0       150         856
## 2         1976          0        978          0       284        1262
## 3         2002        162        486          0       434         920
## 4         1970          0        216          0       540         756
## 5         2000        350        655          0       490        1145
## 6         1995          0        732          0        64         796
##   X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## 1       856       854            0      1710            1            0
## 2      1262         0            0      1262            0            1
## 3       920       866            0      1786            1            0
## 4       961       756            0      1717            1            0
## 5      1145      1053            0      2198            1            0
## 6       796       566            0      1362            1            0
##   FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces
## 1        2        1            3            1            8          0
## 2        2        0            3            1            6          1
## 3        2        1            3            1            6          1
## 4        1        0            3            1            7          1
## 5        2        1            4            1            9          1
## 6        1        1            1            1            5          0
##   GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch
## 1        2003          2        548          0          61             0
## 2        1976          2        460        298           0             0
## 3        2001          2        608          0          42             0
## 4        1998          3        642          0          35           272
## 5        2000          3        836        192          84             0
## 6        1993          2        480         40          30             0
##   X3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SalePrice
## 1          0           0        0       0      2   2008    208500
## 2          0           0        0       0      5   2007    181500
## 3          0           0        0       0      9   2008    223500
## 4          0           0        0       0      2   2006    140000
## 5          0           0        0       0     12   2008    250000
## 6        320           0        0     700     10   2009    143000
# The next step is to find the correlation between each numeric variable and
# SalePrice, and then choose the variables whose correlation is above 0.5.
# Each column is correlated on its own, using only the rows where both values
# are present; vapply() works column by column without turning the data frame
# into a matrix first.
sale_price <- quantVar_df$SalePrice
corSales <- data.frame(
  cor = vapply(
    quantVar_df,
    function(col) cor(col, sale_price, use = "complete.obs"),
    numeric(1)
  )
)
corSales
##                       cor
## Id            -0.02191672
## MSSubClass    -0.08428414
## LotFrontage    0.35179910
## LotArea        0.26384335
## OverallQual    0.79098160
## OverallCond   -0.07785589
## YearBuilt      0.52289733
## YearRemodAdd   0.50710097
## MasVnrArea     0.47749305
## BsmtFinSF1     0.38641981
## BsmtFinSF2    -0.01137812
## BsmtUnfSF      0.21447911
## TotalBsmtSF    0.61358055
## X1stFlrSF      0.60585218
## X2ndFlrSF      0.31933380
## LowQualFinSF  -0.02560613
## GrLivArea      0.70862448
## BsmtFullBath   0.22712223
## BsmtHalfBath  -0.01684415
## FullBath       0.56066376
## HalfBath       0.28410768
## BedroomAbvGr   0.16821315
## KitchenAbvGr  -0.13590737
## TotRmsAbvGrd   0.53372316
## Fireplaces     0.46692884
## GarageYrBlt    0.48636168
## GarageCars     0.64040920
## GarageArea     0.62343144
## WoodDeckSF     0.32441344
## OpenPorchSF    0.31585623
## EnclosedPorch -0.12857796
## X3SsnPorch     0.04458367
## ScreenPorch    0.11144657
## PoolArea       0.09240355
## MiscVal       -0.02118958
## MoSold         0.04643225
## YrSold        -0.02892259
## SalePrice      1.00000000
# Show only the variables whose correlation with SalePrice exceeds 0.5
strong_cor <- which(corSales$cor > 0.5)
corSales[strong_cor, , drop = FALSE]
##                    cor
## OverallQual  0.7909816
## YearBuilt    0.5228973
## YearRemodAdd 0.5071010
## TotalBsmtSF  0.6135806
## X1stFlrSF    0.6058522
## GrLivArea    0.7086245
## FullBath     0.5606638
## TotRmsAbvGrd 0.5337232
## GarageCars   0.6404092
## GarageArea   0.6234314
## SalePrice    1.0000000
# Multiple linear regression of SalePrice on the ten predictors whose
# correlation with SalePrice is above 0.5
model <- lm(
  SalePrice ~ OverallQual + YearBuilt + YearRemodAdd + TotalBsmtSF +
    X1stFlrSF + GrLivArea + FullBath + TotRmsAbvGrd + GarageCars + GarageArea,
  data = train
)

# Coefficients, residual summary and goodness of fit
summary(model)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + YearBuilt + YearRemodAdd + 
##     TotalBsmtSF + X1stFlrSF + GrLivArea + FullBath + TotRmsAbvGrd + 
##     GarageCars + GarageArea, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -489958  -19316   -1948   16020  290558 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.186e+06  1.291e+05  -9.187  < 2e-16 ***
## OverallQual   1.960e+04  1.190e+03  16.472  < 2e-16 ***
## YearBuilt     2.682e+02  5.035e+01   5.328 1.15e-07 ***
## YearRemodAdd  2.965e+02  6.363e+01   4.659 3.47e-06 ***
## TotalBsmtSF   1.986e+01  4.295e+00   4.625 4.09e-06 ***
## X1stFlrSF     1.417e+01  4.930e+00   2.875 0.004097 ** 
## GrLivArea     5.130e+01  4.233e+00  12.119  < 2e-16 ***
## FullBath     -6.791e+03  2.682e+03  -2.532 0.011457 *  
## TotRmsAbvGrd  3.310e+01  1.119e+03   0.030 0.976404    
## GarageCars    1.042e+04  3.044e+03   3.422 0.000639 ***
## GarageArea    1.495e+01  1.031e+01   1.450 0.147384    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37920 on 1449 degrees of freedom
## Multiple R-squared:  0.7737, Adjusted R-squared:  0.7721 
## F-statistic: 495.4 on 10 and 1449 DF,  p-value: < 2.2e-16

The R^2 is 0.7737, meaning that 77.37% of the variance in SalePrice can be explained by the model.

# Load the Kaggle test set, keeping text columns as character vectors
test <- read.csv(
  file = "https://raw.githubusercontent.com/VioletaStoyanova/Data605/master/test.csv",
  stringsAsFactors = FALSE
)
head(test, n = 6L)
##     Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1461         20       RH          80   11622   Pave  <NA>      Reg
## 2 1462         20       RL          81   14267   Pave  <NA>      IR1
## 3 1463         60       RL          74   13830   Pave  <NA>      IR1
## 4 1464         60       RL          78    9978   Pave  <NA>      IR1
## 5 1465        120       RL          43    5005   Pave  <NA>      IR1
## 6 1466         60       RL          75   10000   Pave  <NA>      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl        NAmes      Feedr
## 2         Lvl    AllPub    Corner       Gtl        NAmes       Norm
## 3         Lvl    AllPub    Inside       Gtl      Gilbert       Norm
## 4         Lvl    AllPub    Inside       Gtl      Gilbert       Norm
## 5         HLS    AllPub    Inside       Gtl      StoneBr       Norm
## 6         Lvl    AllPub    Corner       Gtl      Gilbert       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     1Story           5           6      1961
## 2       Norm     1Fam     1Story           6           6      1958
## 3       Norm     1Fam     2Story           5           5      1997
## 4       Norm     1Fam     2Story           6           6      1998
## 5       Norm   TwnhsE     1Story           8           5      1992
## 6       Norm     1Fam     2Story           6           5      1993
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         1961     Gable  CompShg     VinylSd     VinylSd       None
## 2         1958       Hip  CompShg     Wd Sdng     Wd Sdng    BrkFace
## 3         1998     Gable  CompShg     VinylSd     VinylSd       None
## 4         1998     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 5         1992     Gable  CompShg     HdBoard     HdBoard       None
## 6         1994     Gable  CompShg     HdBoard     HdBoard       None
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1          0        TA        TA     CBlock       TA       TA           No
## 2        108        TA        TA     CBlock       TA       TA           No
## 3          0        TA        TA      PConc       Gd       TA           No
## 4         20        TA        TA      PConc       TA       TA           No
## 5          0        Gd        TA      PConc       Gd       TA           No
## 6          0        TA        TA      PConc       Gd       TA           No
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          Rec        468          LwQ        144       270         882
## 2          ALQ        923          Unf          0       406        1329
## 3          GLQ        791          Unf          0       137         928
## 4          GLQ        602          Unf          0       324         926
## 5          ALQ        263          Unf          0      1017        1280
## 6          Unf          0          Unf          0       763         763
##   Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1    GasA        TA          Y      SBrkr       896         0            0
## 2    GasA        TA          Y      SBrkr      1329         0            0
## 3    GasA        Gd          Y      SBrkr       928       701            0
## 4    GasA        Ex          Y      SBrkr       926       678            0
## 5    GasA        Ex          Y      SBrkr      1280         0            0
## 6    GasA        Gd          Y      SBrkr       763       892            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1       896            0            0        1        0            2
## 2      1329            0            0        1        1            3
## 3      1629            0            0        2        1            3
## 4      1604            0            0        2        1            3
## 5      1280            0            0        2        0            2
## 6      1655            0            0        2        1            3
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          TA            5        Typ          0        <NA>
## 2            1          Gd            6        Typ          0        <NA>
## 3            1          TA            6        Typ          1          TA
## 4            1          Gd            7        Typ          1          Gd
## 5            1          Gd            5        Typ          0        <NA>
## 6            1          TA            7        Typ          1          TA
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Attchd        1961          Unf          1        730         TA
## 2     Attchd        1958          Unf          1        312         TA
## 3     Attchd        1997          Fin          2        482         TA
## 4     Attchd        1998          Fin          2        470         TA
## 5     Attchd        1992          RFn          2        506         TA
## 6     Attchd        1993          Fin          2        440         TA
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1         TA          Y        140           0             0          0
## 2         TA          Y        393          36             0          0
## 3         TA          Y        212          34             0          0
## 4         TA          Y        360          36             0          0
## 5         TA          Y          0          82             0          0
## 6         TA          Y        157          84             0          0
##   ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1         120        0   <NA> MnPrv        <NA>       0      6   2010
## 2           0        0   <NA>  <NA>        Gar2   12500      6   2010
## 3           0        0   <NA> MnPrv        <NA>       0      3   2010
## 4           0        0   <NA>  <NA>        <NA>       0      6   2010
## 5         144        0   <NA>  <NA>        <NA>       0      1   2010
## 6           0        0   <NA>  <NA>        <NA>       0      4   2010
##   SaleType SaleCondition
## 1       WD        Normal
## 2       WD        Normal
## 3       WD        Normal
## 4       WD        Normal
## 5       WD        Normal
## 6       WD        Normal
# predict() returns NA for any test row with a missing predictor. Filling those
# predictions (or negative ones) with 0 is heavily penalised by a log-error
# metric, so missing predictor values are instead imputed with the training
# median before predicting. The imputation is done on a copy; `test` itself is
# left untouched.
predictors <- attr(terms(model), "term.labels")
test_filled <- test
for (v in predictors) {
  missing_v <- is.na(test_filled[[v]])
  if (any(missing_v)) {
    test_filled[[v]][missing_v] <- median(train[[v]], na.rm = TRUE)
  }
}
mySalePrice <- predict(model, test_filled)
# A sale price cannot be negative: floor the predictions at the lowest price
# observed in the training data. Only SalePrice is adjusted, never Id.
mySalePrice <- pmax(mySalePrice, min(train$SalePrice))
# Create the submission data frame
# NOTE: any previously reported Kaggle score was obtained with zero-filled
# predictions and has to be refreshed after re-submitting.
pricepred <- data.frame(Id = test[, "Id"], SalePrice = mySalePrice)

head(pricepred)
##     Id SalePrice
## 1 1461  110135.9
## 2 1462  159060.0
## 3 1463  169683.7
## 4 1464  188059.7
## 5 1465  219782.0
## 6 1466  182152.0
# Save the predictions as a CSV file for submission (no row-name column)
write.csv(x = pricepred, file = "pricepred.csv", row.names = FALSE)

My Kaggle score was 0.85356 and my Kaggle user name is violetastoyanova.