Problem 1.

Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean and standard deviation of \[\mu=\sigma=(N+1)/2\]

library(gmodels)
library(ggplot2)
library(ggpubr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(broom)
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.0.5
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.6     v purrr   0.3.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.0.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.5
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.0.5
## ResourceSelection 0.3-5   2019-07-22
library(psych)
## Warning: package 'psych' was built under R version 4.0.5
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(matrixcalc)

# Fix the RNG state so every draw below is reproducible.
set.seed(11706)
# Upper bound of the uniform range; the assignment requires N >= 6.
N <- 6
# 10,000 continuous uniform draws on [1, N].
X <- runif(n = 10000, min = 1, max = N)
# 10,000 normal draws whose mean and standard deviation both equal (N + 1) / 2.
Y <- rnorm(n = 10000, mean = (N + 1) / 2, sd = (N + 1) / 2)
hist(X)

summary(X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.282   3.564   3.535   4.793   6.000
hist(Y)

summary(Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -9.886   1.218   3.618   3.550   5.917  18.255

Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities. 5 points

# x: the sample median of X.
x <- median(X)
#median
x
## [1] 3.563532
# y: the first quartile of Y; quantile() returns a length-1 vector named "25%".
y <- quantile(Y, probs = 0.25)
#1st quartile

y
##      25% 
## 1.218006
  a. P(X>x | X>y)

\[= \frac{P(X>x \cap X>y)}{P(X>y)}\] \[= \frac{P(X>3.564 \cap X>1.218)}{P(X>1.218)}\]

# a. P(X>x | X>y) = P(X>x and X>y) / P(X>y), estimated from sample counts.
# The numerator tests both conditions explicitly, so it remains correct even
# if x were not larger than y (sum() of a logical vector counts the TRUEs).
PAnB <- sum(X > x & X > y)
PAnB
## [1] 5000
# Denominator: number of draws satisfying the conditioning event X > y.
PB <- sum(X > y)
PB
## [1] 9587
#a.   P(X>x | X>y)
Prob_Part_a <- PAnB / PB
Prob_Part_a
## [1] 0.5215396
  b. P(X>x, Y>y)

X, Y are independent, so

P(X>x, Y>y)=P(X>x)*P(Y>y)

# b. P(X>x, Y>y): X and Y are drawn independently, so the joint probability
# is estimated as the product of the two marginal proportions.
# mean() of a logical vector is the proportion of TRUEs, so the sample size
# does not need to be hard-coded.
PX <- mean(X > x)
PX
## [1] 0.5
PY <- mean(Y > y)
PY
## [1] 0.75
# Named for part b so it is not clobbered by the part c result computed later.
Prob_Part_b <- PX * PY
Prob_Part_b
## [1] 0.375
  c. P(X<x | X>y)
    \[= \frac{P(X<3.564 \cap X>1.218)}{P(X>1.218)}\]
# c. P(X<x | X>y) = P(y < X < x) / P(X>y), estimated from sample counts.
# The joint event is counted directly instead of subtracting two counts,
# which is only valid when y < x.
PAnB2 <- sum(X < x & X > y)
PAnB2
## [1] 4587
# Denominator: number of draws satisfying the conditioning event X > y.
PB <- sum(X > y)
PB
## [1] 9587
#c.   P(X<x | X>y)
Prob_Part_c <- PAnB2 / PB
Prob_Part_c
## [1] 0.4784604

5 points. Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.

# Joint and marginal probability table for the events {X > x} and {Y > y}.
# Rows are the Y events, columns are the X events; the last row and column
# hold the marginal totals. Each cell is the proportion of draws in that
# event (mean() of a logical vector), so the table stays correct if the
# number of simulated draws changes.
joint <- matrix(
  c(mean(X > x & Y > y),  mean(X <= x & Y > y),  mean(Y > y),
    mean(X > x & Y <= y), mean(X <= x & Y <= y), mean(Y <= y),
    mean(X > x),          mean(X <= x),          1.00),
  ncol = 3, byrow = TRUE
)
colnames(joint) <- c("X>x", "X<=x", "Total")
rownames(joint) <- c("Y>y", "Y<=y", "Total")

joint 
##          X>x   X<=x Total
## Y>y   0.3776 0.3724  0.75
## Y<=y  0.1224 0.1276  0.25
## Total 0.5000 0.5000  1.00

From the simulation-based table, P(X>x and Y>y) = .3776, while P(X>x)P(Y>y) = .5 * .75 = .375.

If X and Y are independent, then P(X>x and Y>y) = P(X>x)P(Y>y). The two values are close (.3776 vs. .375), and given how the random variables X and Y were generated, I conclude that they are independent.

#####################################################

5 points. Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?

# Put the two simulated vectors side by side as columns X and Y; the
# categorical variables XCAT and YCAT are derived from this frame next.
XY <- data.frame(X = X, Y = Y)
head(XY)
##          X          Y
## 1 5.520336 -2.5471430
## 2 1.300727 -4.0508301
## 3 2.676213  6.1764164
## 4 2.006501 -0.4865294
## 5 3.940985  2.7240644
## 6 5.590427  6.8129352
# Dichotomise each variable at its threshold (x for X, y for Y) so that
# independence can be tested on a 2x2 table.
XY2 <- XY %>%
  mutate(
    XCAT = if_else(X > x, "X>x", "X<=x"),
    YCAT = if_else(Y > y, "Y>y", "Y<=y")
  )
head(XY2)
##          X          Y XCAT YCAT
## 1 5.520336 -2.5471430  X>x Y<=y
## 2 1.300727 -4.0508301 X<=x Y<=y
## 3 2.676213  6.1764164 X<=x  Y>y
## 4 2.006501 -0.4865294 X<=x Y<=y
## 5 3.940985  2.7240644  X>x  Y>y
## 6 5.590427  6.8129352  X>x  Y>y
# 2x2 contingency table of counts (rows: XCAT, columns: YCAT); this is the
# object passed to fisher.test(), chisq.test() and mosaicplot() further down.
UseTable<-table(XY2$XCAT,XY2$YCAT)

# Same cross-tabulation printed with chi-square contributions and
# row/column/table proportions per cell.
CrossTable(XY2$XCAT,XY2$YCAT)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  10000 
## 
##  
##              | XY2$YCAT 
##     XY2$XCAT |      Y<=y |       Y>y | Row Total | 
## -------------|-----------|-----------|-----------|
##         X<=x |      1276 |      3724 |      5000 | 
##              |     0.541 |     0.180 |           | 
##              |     0.255 |     0.745 |     0.500 | 
##              |     0.510 |     0.497 |           | 
##              |     0.128 |     0.372 |           | 
## -------------|-----------|-----------|-----------|
##          X>x |      1224 |      3776 |      5000 | 
##              |     0.541 |     0.180 |           | 
##              |     0.245 |     0.755 |     0.500 | 
##              |     0.490 |     0.503 |           | 
##              |     0.122 |     0.378 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |      2500 |      7500 |     10000 | 
##              |     0.250 |     0.750 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
######################
##################Fisher's Test

fisher.test(UseTable)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  UseTable
## p-value = 0.2389
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.964496 1.158476
## sample estimates:
## odds ratio 
##   1.057068
######################
##################Chisq Test

chisq.test(UseTable)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  UseTable
## X-squared = 1.3872, df = 1, p-value = 0.2389
mosaicplot(UseTable, main = "Mosaic plot:  X uniform vs Y normal", color = TRUE)

The chi-square test is an appropriate test of independence here because the sample size is large (n = 10,000).

Fisher's exact test is typically employed when a cell count is below 5; however, it can be used with large samples as well (with a trade-off of much more computing time).

There is no association between XCAT and YCAT. This conclusion is supported by the non-significant chi-square test. Furthermore, consider the mosaic plot, which gives an overview of the data and makes it possible to recognize relationships between the variables: notice the nearly equal proportions of the XCAT categories within each YCAT category.

#####################################################

Problem 2. You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following.

5 points. Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set.

##covid<-"https://raw.githubusercontent.com/lszydziak/data608/main/COVID-19_Reported_Patient_Impact_and_Hospital_Capacity_by_State_Timeseries_043021.csv"

# Location of the Kaggle training file (an absolute path on the author's machine).
train1 <- "C:/Users/Lisa/Documents/CUNY/605/Final/Kaggle/train.csv"

# Parse it as comma-separated text whose first row holds the column names.
train <- read.table(train1, sep = ",", header = TRUE)

# Derived columns, appended in this order:
#   BsmtLivArea  - finished basement area (type 1 + type 2 square feet)
#   TotBath      - full + half baths, above grade and in the basement
#   DwellType    - short text label for each MSSubClass code (NA if unlisted)
#   logSalePrice - natural log of SalePrice
train <- train %>%
  mutate(
    BsmtLivArea = BsmtFinSF1 + BsmtFinSF2,
    TotBath = FullBath + HalfBath + BsmtFullBath + BsmtHalfBath,
    DwellType = case_when(
      MSSubClass == 20 ~ "1StoryNew",
      MSSubClass == 30 ~ "1StoryOld",
      MSSubClass == 40 ~ "1StoryAtt",
      MSSubClass == 45 ~ "1.5StoryUnfin",
      MSSubClass == 50 ~ "1.5StoryFin",
      MSSubClass == 60 ~ "2StoryNew",
      MSSubClass == 70 ~ "2StoryOld",
      MSSubClass == 75 ~ "2.5Story",
      MSSubClass == 80 ~ "MultiLev",
      MSSubClass == 85 ~ "SplitFoy",
      MSSubClass == 90 ~ "Duplex",
      MSSubClass == 120 ~ "1SPUDNew",
      MSSubClass %in% c(150, 180) ~ "2_1.5PUD",
      MSSubClass == 160 ~ "2PUDNew",
      MSSubClass == 190 ~ "2Fam"
    ),
    logSalePrice = log(SalePrice)
  )


# Put train's columns on the search path so they can be referenced by bare
# name from here on. NOTE(review): attach() exposes a snapshot, so later
# modifications to `train` are not visible through the bare names.
attach(train)
# Print the column names, types and first few values of the training data.
str(train)
## 'data.frame':    1460 obs. of  85 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : chr  "RL" "RL" "RL" "RL" ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley        : chr  NA NA NA NA ...
##  $ LotShape     : chr  "Reg" "Reg" "IR1" "IR1" ...
##  $ LandContour  : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr  "Inside" "FR2" "Inside" "Corner" ...
##  $ LandSlope    : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr  "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ Condition1   : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition2   : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr  "2Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : chr  "Gable" "Gable" "Gable" "Gable" ...
##  $ RoofMatl     : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ Exterior2nd  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ MasVnrType   : chr  "BrkFace" "None" "BrkFace" "None" ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : chr  "Gd" "TA" "Gd" "TA" ...
##  $ ExterCond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr  "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ BsmtQual     : chr  "Gd" "Gd" "Gd" "TA" ...
##  $ BsmtCond     : chr  "TA" "TA" "TA" "Gd" ...
##  $ BsmtExposure : chr  "No" "Gd" "Mn" "No" ...
##  $ BsmtFinType1 : chr  "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : chr  "Unf" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr  "Ex" "Ex" "Ex" "Gd" ...
##  $ CentralAir   : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : chr  "Gd" "TA" "Gd" "Gd" ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : chr  NA "TA" "TA" "Gd" ...
##  $ GarageType   : chr  "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : chr  "RFn" "RFn" "RFn" "Unf" ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : chr  "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr  "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr  "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : chr  NA NA NA NA ...
##  $ Fence        : chr  NA NA NA NA ...
##  $ MiscFeature  : chr  NA NA NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : chr  "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr  "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
##  $ BsmtLivArea  : int  706 978 486 216 655 732 1369 891 0 851 ...
##  $ TotBath      : int  4 3 4 2 4 3 3 4 2 2 ...
##  $ DwellType    : chr  "2StoryNew" "1StoryNew" "2StoryNew" "2StoryOld" ...
##  $ logSalePrice : num  12.2 12.1 12.3 11.8 12.4 ...
summary(train)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical          X1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##    X2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      X3SsnPorch      ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice       BsmtLivArea        TotBath       DwellType        
##  Min.   : 34900   Min.   :   0.0   Min.   :1.000   Length:1460       
##  1st Qu.:129975   1st Qu.:   0.0   1st Qu.:2.000   Class :character  
##  Median :163000   Median : 465.0   Median :2.000   Mode  :character  
##  Mean   :180921   Mean   : 490.2   Mean   :2.431                     
##  3rd Qu.:214000   3rd Qu.: 790.2   3rd Qu.:3.000                     
##  Max.   :755000   Max.   :5644.0   Max.   :6.000                     
##                                                                      
##   logSalePrice  
##  Min.   :10.46  
##  1st Qu.:11.78  
##  Median :12.00  
##  Mean   :12.02  
##  3rd Qu.:12.27  
##  Max.   :13.53  
## 
# Scatterplot matrix of log sale price against ten numeric predictors.
pairs(
  logSalePrice ~ LotArea + GrLivArea + BsmtLivArea + YrSold + X1stFlrSF +
    X2ndFlrSF + TotBath + YearRemodAdd + OverallQual + OverallCond,
  data = train,
  gap = 0.4,
  cex.labels = 1.5
)

# psych::pairs.panels views of two smaller predictor groups, each shown
# together with log sale price.
pairs.panels(
  train[c("LotArea", "GrLivArea", "BsmtLivArea", "YrSold", "TotBath",
          "logSalePrice")]
)

pairs.panels(
  train[c("OverallQual", "OverallCond", "YearRemodAdd", "FullBath",
          "logSalePrice")]
)

############
# Box plots of log sale price by categorical/ordinal predictor, drawn as two
# 2x3 pages of five plots each. Columns are looked up through `data = train`
# rather than by attaching `train` a second time, which only added a
# duplicate search-path entry and produced masking messages.
# Each vector maps a predictor column name to its x-axis label.
first_page <- c(
  DwellType    = "type of dwelling",
  Neighborhood = "Neighborhood",
  Condition1   = "Condition",
  Utilities    = "Utilities",
  BldgType     = "BldgType"
)
second_page <- c(
  Exterior1st   = "Exterior",
  OverallQual   = "OverallQuality",
  OverallCond   = "OverallCondition",
  HouseStyle    = "HouseStyle",
  SaleCondition = "SaleCondition"
)

# Remember the previous layout so it can be restored once plotting is done.
old_par <- par(mfrow = c(2, 3))
for (page in list(first_page, second_page)) {
  # Re-setting mfrow makes the next plot start a fresh 2x3 page.
  par(mfrow = c(2, 3))
  for (predictor in names(page)) {
    boxplot(
      reformulate(predictor, response = "logSalePrice"),
      data = train,
      # The response is the log of the sale price, so label it as such.
      ylab = "log(SalePrice)",
      xlab = page[[predictor]]
    )
  }
}
par(old_par)

hist(train$SalePrice)

hist(train$logSalePrice)

Provide a scatterplot matrix for at least two of the independent variables and the dependent variable.

plot(train$GrLivArea,train$logSalePrice)

plot(train$TotBath,train$logSalePrice)  

plot(train$OverallQual,train$logSalePrice)

Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

# Collect the three quantitative variables; data.frame() names the columns
# train.logSalePrice, train.OverallQual and train.GrLivArea.
matcorr<-data.frame(train$logSalePrice,train$OverallQual,train$GrLivArea)

# 3x3 Pearson correlation matrix, reused for the linear-algebra steps below.
corr<-cor(matcorr)

cor.test(train$logSalePrice,train$OverallQual, conf.level = .8)
## 
##  Pearson's product-moment correlation
## 
## data:  train$logSalePrice and train$OverallQual
## t = 54.137, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.8057205 0.8280363
## sample estimates:
##       cor 
## 0.8171844
# Test H0: correlation = 0 for logSalePrice vs GrLivArea, with an 80% CI.
cor.test(train$logSalePrice,train$GrLivArea, conf.level = .8)
## 
##  Pearson's product-moment correlation
## 
## data:  train$logSalePrice and train$GrLivArea
## t = 37.525, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6834425 0.7176071
## sample estimates:
##       cor 
## 0.7009267
# Remaining pair of the three variables, so that every pairwise correlation
# in the matrix is tested (same H0 and 80% confidence level).
cor.test(train$OverallQual, train$GrLivArea, conf.level = .8)

Correlation between logSalePrice and OverallQual is .82 with 80% CI [.81,.83]

Correlation between logSalePrice and GrLivArea is .70 with 80% CI [.68,.72]

Correlation is pretty high, I would consider including these two variables in the model.

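Whether family-wise error is a concern depends on how the results will be used. With three pairwise comparisons among the three variables, each at the 80% confidence level, the chance that at least one interval misses its true value could be as high as 1 - 0.8^3 ≈ 0.49 if the comparisons were independent. If I were relying on the p-values or intervals for formal inference, I would therefore be concerned and apply a correction (e.g., Bonferroni). For the tests shown here the p-values (< 2.2e-16) are far below any corrected threshold, so the conclusions would not change.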

5 points. Linear Algebra and Correlation. Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

# Recap: print the correlation matrix, then compute and print its inverse
# (the precision matrix).
corr
##                    train.logSalePrice train.OverallQual train.GrLivArea
## train.logSalePrice          1.0000000         0.8171844       0.7009267
## train.OverallQual           0.8171844         1.0000000       0.5930074
## train.GrLivArea             0.7009267         0.5930074       1.0000000
# matrix.inverse() is not defined in this section (presumably from the
# matrixcalc package -- confirm). Per the prompt, the diagonal of the result
# holds the variance inflation factors.
invcorr<-matrix.inverse(corr)
invcorr
##                    train.logSalePrice train.OverallQual train.GrLivArea
## train.logSalePrice           3.845748        -2.3817394      -1.2831982
## train.OverallQual           -2.381739         3.0174484      -0.1199447
## train.GrLivArea             -1.283198        -0.1199447       1.9705559
# Correlation matrix %*% precision matrix. The result is the identity matrix
# up to floating-point error (off-diagonal entries are on the order of 1e-16).

corrXinvcorr<-corr %*% invcorr
corrXinvcorr
##                    train.logSalePrice train.OverallQual train.GrLivArea
## train.logSalePrice       1.000000e+00      1.804112e-16    2.220446e-16
## train.OverallQual       -2.220446e-16      1.000000e+00    2.220446e-16
## train.GrLivArea         -2.220446e-16      2.081668e-16    1.000000e+00
# Precision matrix %*% correlation matrix. Again the identity up to
# floating-point error, as expected for a matrix and its inverse.

invcorrXcorr<-invcorr %*% corr
invcorrXcorr
##                    train.logSalePrice train.OverallQual train.GrLivArea
## train.logSalePrice       1.000000e+00     -5.551115e-16   -2.220446e-16
## train.OverallQual        6.106227e-16      1.000000e+00    4.163336e-16
## train.GrLivArea          0.000000e+00      0.000000e+00    1.000000e+00
# LU decomposition of the correlation matrix. The printed L is lower
# triangular with a unit diagonal and the printed U is upper triangular.
# lu.decomposition() is not defined in this section (presumably matrixcalc --
# confirm).

LU<-lu.decomposition(corr)
print(LU$L)
##           [,1]       [,2] [,3]
## [1,] 1.0000000 0.00000000    0
## [2,] 0.8171844 1.00000000    0
## [3,] 0.7009267 0.06086847    1
print(LU$U)
##      [,1]      [,2]       [,3]
## [1,]    1 0.8171844 0.70092665
## [2,]    0 0.3322096 0.02022109
## [3,]    0 0.0000000 0.50747100

5 points. Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

library(MASS)
## Warning: package 'MASS' was built under R version 4.0.5
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Histogram of the variable chosen for the exercise (SalePrice).
hist(train$SalePrice)

# Fit an exponential density to SalePrice by maximum likelihood. The single
# fitted parameter is the rate (lambda); for an exponential the MLE of the
# rate is 1 / mean(x).
to_fit<-fitdistr(train$SalePrice,densfun="exponential")

# Printing the fit shows the rate estimate with its standard error in
# parentheses.
to_fit
##        rate    
##   5.527268e-06 
##  (1.446552e-07)
# Components available on the fitted object.
names(to_fit)
## [1] "estimate" "sd"       "vcov"     "n"        "loglik"
# lambda: the fitted rate parameter
to_fit$estimate
##         rate 
## 5.527268e-06
# Draw 1000 values from the fitted exponential distribution.
# NOTE(review): no seed is set before this call, so the sample (and every
# number derived from it below) changes on each run.
sample_exp<-rexp(1000,to_fit$estimate)

# Compare the simulated exponential sample with the observed sale prices.
# The margins are shrunk for these two figures only. par() returns the
# previous settings, which are restored afterwards so that later plots keep
# their normal margins (and with them their axis labels and titles).
old_par <- par(mar = c(1, 1, 1, 1))
hist(sample_exp)

hist(train$SalePrice)
par(old_par)

# 5th and 95th percentiles of the fitted exponential distribution, obtained
# from its quantile function (the inverse of the CDF) at the fitted rate.

qexp(c(0.05, 0.95), to_fit$estimate)
## [1]   9280.044 541991.465
# Plot the empirical CDF of the simulated sample (not of the original data).

plot(ecdf(sample_exp))

# Five-number summary and mean of the simulated sample.
summary(sample_exp)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     332.2   54481.9  123613.7  181311.0  252706.0 1231162.2
mean(sample_exp)
## [1] 181311
## [1] 181311
library(Rmisc)
## Warning: package 'Rmisc' was built under R version 4.0.5
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:Hmisc':
## 
##     is.discrete, summarize
## The following object is masked from 'package:purrr':
## 
##     compact
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:ggpubr':
## 
##     mutate
# Alternative call left disabled by the author:
#MeanCI(sample_exp,
#       conf.level=0.95)
# Mean and 95% confidence interval for the mean, assuming normality.
# NOTE(review): this is computed on the simulated draws (sample_exp), whereas
# the prompt asks for an interval from the empirical data; train$SalePrice
# would be the argument for that. The empirical 5th/95th percentiles the
# prompt requests (e.g. quantile(train$SalePrice, c(0.05, 0.95))) are also
# not computed anywhere in this section.
CI(sample_exp,
   ci=0.95)
##    upper     mean    lower 
## 192742.8 181311.0 169879.3

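The distribution is clearly not normal, so estimating a confidence interval under a normality assumption is not appropriate. Note also that the interval above was computed from the simulated exponential sample (sample_exp), not from the original SalePrice values.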

10 points. Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

# Fit the multiple regression for log sale price.
# `data = train` already tells lm() where to find every variable, so the data
# frame is not attach()ed here: attaching it again only stacked one more
# masked copy of its columns on the search path.
# NOTE(review): in an R formula `x * x` expands to `x + x + x:x`, which is
# just `x`, so the two product terms below contribute plain linear terms only
# (the coefficient table has a single YearRemodAdd row and a single
# OverallCond row). A quadratic effect would need I(YearRemodAdd^2) and
# I(OverallCond^2); the terms are left as written so the fit is unchanged.
model <- lm(
  logSalePrice ~ DwellType + Neighborhood + OverallQual + GrLivArea +
    BsmtLivArea + TotBath + Exterior1st + Condition1 +
    YearRemodAdd * YearRemodAdd + OverallCond * OverallCond +
    SaleCondition,
  data = train
)
model
## 
## Call:
## lm(formula = logSalePrice ~ DwellType + Neighborhood + OverallQual + 
##     GrLivArea + BsmtLivArea + TotBath + Exterior1st + Condition1 + 
##     YearRemodAdd * YearRemodAdd + OverallCond * OverallCond + 
##     SaleCondition, data = train)
## 
## Coefficients:
##            (Intercept)  DwellType1.5StoryUnfin       DwellType1SPUDNew  
##              8.662e+00              -2.832e-02              -1.686e-02  
##     DwellType1StoryAtt      DwellType1StoryNew      DwellType1StoryOld  
##             -2.894e-02               8.233e-02              -4.640e-02  
##      DwellType2.5Story       DwellType2_1.5PUD           DwellType2Fam  
##             -2.832e-02              -6.503e-02              -5.404e-02  
##       DwellType2PUDNew      DwellType2StoryNew      DwellType2StoryOld  
##             -1.939e-01              -1.547e-02              -2.803e-02  
##        DwellTypeDuplex       DwellTypeMultiLev       DwellTypeSplitFoy  
##             -3.367e-02               5.605e-02               8.440e-02  
##    NeighborhoodBlueste      NeighborhoodBrDale     NeighborhoodBrkSide  
##             -1.001e-01              -1.637e-01              -1.392e-01  
##    NeighborhoodClearCr     NeighborhoodCollgCr     NeighborhoodCrawfor  
##              4.587e-02              -4.984e-02               2.682e-03  
##    NeighborhoodEdwards     NeighborhoodGilbert      NeighborhoodIDOTRR  
##             -1.996e-01              -5.520e-02              -2.882e-01  
##    NeighborhoodMeadowV     NeighborhoodMitchel       NeighborhoodNAmes  
##             -2.432e-01              -9.805e-02              -1.297e-01  
##    NeighborhoodNoRidge     NeighborhoodNPkVill     NeighborhoodNridgHt  
##              7.303e-02              -6.421e-02               1.246e-01  
##     NeighborhoodNWAmes     NeighborhoodOldTown      NeighborhoodSawyer  
##             -1.096e-01              -2.314e-01              -1.237e-01  
##    NeighborhoodSawyerW     NeighborhoodSomerst     NeighborhoodStoneBr  
##             -6.516e-02               4.235e-02               1.406e-01  
##      NeighborhoodSWISU      NeighborhoodTimber     NeighborhoodVeenker  
##             -1.944e-01               7.153e-03               4.235e-02  
##            OverallQual               GrLivArea             BsmtLivArea  
##              8.909e-02               2.517e-04               5.422e-05  
##                TotBath      Exterior1stAsphShn      Exterior1stBrkComm  
##              5.881e-02              -1.410e-01              -2.023e-01  
##     Exterior1stBrkFace       Exterior1stCBlock      Exterior1stCemntBd  
##              1.282e-01               1.494e-01               8.849e-02  
##     Exterior1stHdBoard      Exterior1stImStucc      Exterior1stMetalSd  
##              4.197e-02               9.307e-03               7.807e-02  
##     Exterior1stPlywood        Exterior1stStone       Exterior1stStucco  
##              7.945e-02               7.484e-02              -2.631e-03  
##     Exterior1stVinylSd      Exterior1stWd Sdng      Exterior1stWdShing  
##              8.011e-02               3.789e-02               5.024e-02  
##        Condition1Feedr          Condition1Norm          Condition1PosA  
##             -2.553e-02               6.530e-02               4.940e-02  
##         Condition1PosN          Condition1RRAe          Condition1RRAn  
##              2.177e-02              -3.485e-02               6.264e-02  
##         Condition1RRNe          Condition1RRNn            YearRemodAdd  
##              7.777e-02               7.991e-02               9.768e-04  
##            OverallCond    SaleConditionAdjLand     SaleConditionAlloca  
##              3.455e-02               2.286e-02               4.233e-02  
##    SaleConditionFamily     SaleConditionNormal    SaleConditionPartial  
##              4.328e-02               8.621e-02               1.293e-01
# Coefficient table with standard errors and t-tests, followed by the overall
# fit statistics (residual standard error, R-squared, F-statistic).
summary(model)
## 
## Call:
## lm(formula = logSalePrice ~ DwellType + Neighborhood + OverallQual + 
##     GrLivArea + BsmtLivArea + TotBath + Exterior1st + Condition1 + 
##     YearRemodAdd * YearRemodAdd + OverallCond * OverallCond + 
##     SaleCondition, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.60988 -0.06611  0.00564  0.07088  0.45092 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             8.662e+00  5.437e-01  15.930  < 2e-16 ***
## DwellType1.5StoryUnfin -2.832e-02  4.401e-02  -0.643 0.520056    
## DwellType1SPUDNew      -1.686e-02  2.721e-02  -0.619 0.535761    
## DwellType1StoryAtt     -2.894e-02  7.317e-02  -0.396 0.692514    
## DwellType1StoryNew      8.233e-02  1.881e-02   4.377 1.29e-05 ***
## DwellType1StoryOld     -4.640e-02  2.265e-02  -2.049 0.040665 *  
## DwellType2.5Story      -2.832e-02  3.933e-02  -0.720 0.471505    
## DwellType2_1.5PUD      -6.503e-02  5.835e-02  -1.114 0.265318    
## DwellType2Fam          -5.404e-02  2.923e-02  -1.849 0.064694 .  
## DwellType2PUDNew       -1.939e-01  3.198e-02  -6.061 1.74e-09 ***
## DwellType2StoryNew     -1.547e-02  2.100e-02  -0.737 0.461487    
## DwellType2StoryOld     -2.803e-02  2.334e-02  -1.201 0.230067    
## DwellTypeDuplex        -3.367e-02  2.698e-02  -1.248 0.212220    
## DwellTypeMultiLev       5.605e-02  2.590e-02   2.164 0.030655 *  
## DwellTypeSplitFoy       8.440e-02  3.726e-02   2.265 0.023671 *  
## NeighborhoodBlueste    -1.001e-01  1.108e-01  -0.903 0.366460    
## NeighborhoodBrDale     -1.637e-01  5.971e-02  -2.742 0.006183 ** 
## NeighborhoodBrkSide    -1.392e-01  4.793e-02  -2.904 0.003744 ** 
## NeighborhoodClearCr     4.587e-02  4.934e-02   0.930 0.352733    
## NeighborhoodCollgCr    -4.984e-02  4.069e-02  -1.225 0.220881    
## NeighborhoodCrawfor     2.682e-03  4.696e-02   0.057 0.954463    
## NeighborhoodEdwards    -1.996e-01  4.396e-02  -4.541 6.09e-06 ***
## NeighborhoodGilbert    -5.520e-02  4.324e-02  -1.277 0.201932    
## NeighborhoodIDOTRR     -2.882e-01  4.994e-02  -5.770 9.76e-09 ***
## NeighborhoodMeadowV    -2.432e-01  6.506e-02  -3.739 0.000193 ***
## NeighborhoodMitchel    -9.805e-02  4.515e-02  -2.172 0.030043 *  
## NeighborhoodNAmes      -1.297e-01  4.275e-02  -3.033 0.002465 ** 
## NeighborhoodNoRidge     7.303e-02  4.659e-02   1.567 0.117274    
## NeighborhoodNPkVill    -6.421e-02  6.385e-02  -1.006 0.314719    
## NeighborhoodNridgHt     1.246e-01  4.086e-02   3.050 0.002330 ** 
## NeighborhoodNWAmes     -1.096e-01  4.476e-02  -2.448 0.014474 *  
## NeighborhoodOldTown    -2.314e-01  4.528e-02  -5.110 3.67e-07 ***
## NeighborhoodSawyer     -1.237e-01  4.509e-02  -2.743 0.006160 ** 
## NeighborhoodSawyerW    -6.516e-02  4.414e-02  -1.476 0.140154    
## NeighborhoodSomerst     4.235e-02  4.221e-02   1.003 0.315898    
## NeighborhoodStoneBr     1.406e-01  4.690e-02   2.998 0.002763 ** 
## NeighborhoodSWISU      -1.944e-01  5.222e-02  -3.722 0.000206 ***
## NeighborhoodTimber      7.153e-03  4.600e-02   0.156 0.876443    
## NeighborhoodVeenker     4.235e-02  5.913e-02   0.716 0.474014    
## OverallQual             8.909e-02  4.869e-03  18.298  < 2e-16 ***
## GrLivArea               2.517e-04  1.312e-05  19.180  < 2e-16 ***
## BsmtLivArea             5.422e-05  1.060e-05   5.116 3.56e-07 ***
## TotBath                 5.881e-02  7.319e-03   8.036 1.96e-15 ***
## Exterior1stAsphShn     -1.410e-01  1.477e-01  -0.954 0.340079    
## Exterior1stBrkComm     -2.023e-01  1.071e-01  -1.889 0.059094 .  
## Exterior1stBrkFace      1.282e-01  3.931e-02   3.261 0.001139 ** 
## Exterior1stCBlock       1.494e-01  1.477e-01   1.011 0.311981    
## Exterior1stCemntBd      8.849e-02  4.145e-02   2.135 0.032953 *  
## Exterior1stHdBoard      4.197e-02  3.552e-02   1.182 0.237598    
## Exterior1stImStucc      9.307e-03  1.482e-01   0.063 0.949941    
## Exterior1stMetalSd      7.807e-02  3.443e-02   2.267 0.023524 *  
## Exterior1stPlywood      7.945e-02  3.725e-02   2.133 0.033100 *  
## Exterior1stStone        7.484e-02  1.089e-01   0.687 0.492039    
## Exterior1stStucco      -2.631e-03  4.425e-02  -0.059 0.952595    
## Exterior1stVinylSd      8.011e-02  3.516e-02   2.278 0.022858 *  
## Exterior1stWd Sdng      3.789e-02  3.444e-02   1.100 0.271449    
## Exterior1stWdShing      5.024e-02  4.364e-02   1.151 0.249895    
## Condition1Feedr        -2.553e-02  2.776e-02  -0.920 0.357941    
## Condition1Norm          6.530e-02  2.300e-02   2.839 0.004590 ** 
## Condition1PosA          4.940e-02  5.670e-02   0.871 0.383756    
## Condition1PosN          2.177e-02  4.067e-02   0.535 0.592548    
## Condition1RRAe         -3.485e-02  5.050e-02  -0.690 0.490239    
## Condition1RRAn          6.264e-02  3.757e-02   1.667 0.095694 .  
## Condition1RRNe          7.777e-02  1.053e-01   0.739 0.460270    
## Condition1RRNn          7.991e-02  6.871e-02   1.163 0.245010    
## YearRemodAdd            9.768e-04  2.830e-04   3.452 0.000573 ***
## OverallCond             3.455e-02  4.178e-03   8.270 3.11e-16 ***
## SaleConditionAdjLand    2.286e-02  7.564e-02   0.302 0.762534    
## SaleConditionAlloca     4.233e-02  4.627e-02   0.915 0.360427    
## SaleConditionFamily     4.328e-02  3.547e-02   1.220 0.222541    
## SaleConditionNormal     8.621e-02  1.526e-02   5.650 1.94e-08 ***
## SaleConditionPartial    1.293e-01  2.150e-02   6.015 2.30e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1417 on 1388 degrees of freedom
## Multiple R-squared:  0.8802, Adjusted R-squared:  0.8741 
## F-statistic: 143.7 on 71 and 1388 DF,  p-value: < 2.2e-16
# Residuals against fitted values.
plot(fitted(model),resid(model))

# Residuals are evenly scattered with no visible pattern, so this looks OK.
# Normal Q-Q plot of the residuals:
qqnorm(resid(model))

# The Q-Q plot is acceptable; the points at both ends may need investigating.

# More diagnostics: augment() returns the model data with per-observation
# columns such as .fitted, .resid, .hat, .sigma, .cooksd and .std.resid.
model.diag.metrics <- augment(model)
head(model.diag.metrics)
## # A tibble: 6 x 18
##   logSalePrice DwellType Neighborhood OverallQual GrLivArea BsmtLivArea TotBath
##          <dbl> <chr>     <chr>              <int>     <int>       <int>   <int>
## 1         12.2 2StoryNew CollgCr                7      1710         706       4
## 2         12.1 1StoryNew Veenker                6      1262         978       3
## 3         12.3 2StoryNew CollgCr                7      1786         486       4
## 4         11.8 2StoryOld Crawfor                7      1717         216       2
## 5         12.4 2StoryNew NoRidge                8      2198         655       4
## 6         11.9 1.5Story~ Mitchel                5      1362         732       3
## # ... with 11 more variables: Exterior1st <chr>, Condition1 <chr>,
## #   YearRemodAdd <int>, OverallCond <int>, SaleCondition <chr>, .fitted <dbl>,
## #   .resid <dbl>, .hat <dbl>, .sigma <dbl>, .cooksd <dbl>, .std.resid <dbl>
# Draw the diagnostic plots for the fitted model (autoplot() is presumably
# the ggfortify method for lm objects -- confirm).
autoplot(model)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 5 row(s) containing missing values (geom_path).

# The residuals-vs-fitted panel shows no pattern, so it is OK.
# The normal Q-Q panel is OK, but the points at the left end are a concern.

# Leverage: the 2 high-leverage points may need investigating.

Predict using test dataset

# Location of the Kaggle test set.
test1 <- "C:/Users/Lisa/Documents/CUNY/605/Final/Kaggle/test.csv"

test <- read.table(file = test1, header = TRUE, sep = ",")

# Missing basement areas and bath counts are treated as 0 so that the derived
# totals below are never NA.
zero_fill_cols <- c(
  "BsmtFinSF1", "BsmtFinSF2",
  "FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath"
)
for (col in zero_fill_cols) {
  test[[col]][is.na(test[[col]])] <- 0
}

# Derive the two engineered predictors used by the model formula.
# dplyr::mutate() is named explicitly because plyr, attached after dplyr when
# Rmisc was loaded, masks mutate().
test <- test %>%
  dplyr::mutate(
    BsmtLivArea = BsmtFinSF1 + BsmtFinSF2,
    TotBath = FullBath + HalfBath + BsmtFullBath + BsmtHalfBath
  )

# Impute missing Exterior1st values with "MetalSd".
# (Original note: chosen "based on highest number/age" -- presumably the most
# common siding among homes of similar age; confirm.)
test <- test %>%
  dplyr::mutate(Exterior1st = dplyr::coalesce(Exterior1st, "MetalSd"))

head(test)             
##     Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1461         20       RH          80   11622   Pave  <NA>      Reg
## 2 1462         20       RL          81   14267   Pave  <NA>      IR1
## 3 1463         60       RL          74   13830   Pave  <NA>      IR1
## 4 1464         60       RL          78    9978   Pave  <NA>      IR1
## 5 1465        120       RL          43    5005   Pave  <NA>      IR1
## 6 1466         60       RL          75   10000   Pave  <NA>      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2
## 1         Lvl    AllPub    Inside       Gtl        NAmes      Feedr       Norm
## 2         Lvl    AllPub    Corner       Gtl        NAmes       Norm       Norm
## 3         Lvl    AllPub    Inside       Gtl      Gilbert       Norm       Norm
## 4         Lvl    AllPub    Inside       Gtl      Gilbert       Norm       Norm
## 5         HLS    AllPub    Inside       Gtl      StoneBr       Norm       Norm
## 6         Lvl    AllPub    Corner       Gtl      Gilbert       Norm       Norm
##   BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle
## 1     1Fam     1Story           5           6      1961         1961     Gable
## 2     1Fam     1Story           6           6      1958         1958       Hip
## 3     1Fam     2Story           5           5      1997         1998     Gable
## 4     1Fam     2Story           6           6      1998         1998     Gable
## 5   TwnhsE     1Story           8           5      1992         1992     Gable
## 6     1Fam     2Story           6           5      1993         1994     Gable
##   RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond
## 1  CompShg     VinylSd     VinylSd       None          0        TA        TA
## 2  CompShg     Wd Sdng     Wd Sdng    BrkFace        108        TA        TA
## 3  CompShg     VinylSd     VinylSd       None          0        TA        TA
## 4  CompShg     VinylSd     VinylSd    BrkFace         20        TA        TA
## 5  CompShg     HdBoard     HdBoard       None          0        Gd        TA
## 6  CompShg     HdBoard     HdBoard       None          0        TA        TA
##   Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 1     CBlock       TA       TA           No          Rec        468
## 2     CBlock       TA       TA           No          ALQ        923
## 3      PConc       Gd       TA           No          GLQ        791
## 4      PConc       TA       TA           No          GLQ        602
## 5      PConc       Gd       TA           No          ALQ        263
## 6      PConc       Gd       TA           No          Unf          0
##   BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir
## 1          LwQ        144       270         882    GasA        TA          Y
## 2          Unf          0       406        1329    GasA        TA          Y
## 3          Unf          0       137         928    GasA        Gd          Y
## 4          Unf          0       324         926    GasA        Ex          Y
## 5          Unf          0      1017        1280    GasA        Ex          Y
## 6          Unf          0       763         763    GasA        Gd          Y
##   Electrical X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## 1      SBrkr       896         0            0       896            0
## 2      SBrkr      1329         0            0      1329            0
## 3      SBrkr       928       701            0      1629            0
## 4      SBrkr       926       678            0      1604            0
## 5      SBrkr      1280         0            0      1280            0
## 6      SBrkr       763       892            0      1655            0
##   BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual
## 1            0        1        0            2            1          TA
## 2            0        1        1            3            1          Gd
## 3            0        2        1            3            1          TA
## 4            0        2        1            3            1          Gd
## 5            0        2        0            2            1          Gd
## 6            0        2        1            3            1          TA
##   TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## 1            5        Typ          0        <NA>     Attchd        1961
## 2            6        Typ          0        <NA>     Attchd        1958
## 3            6        Typ          1          TA     Attchd        1997
## 4            7        Typ          1          Gd     Attchd        1998
## 5            5        Typ          0        <NA>     Attchd        1992
## 6            7        Typ          1          TA     Attchd        1993
##   GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive
## 1          Unf          1        730         TA         TA          Y
## 2          Unf          1        312         TA         TA          Y
## 3          Fin          2        482         TA         TA          Y
## 4          Fin          2        470         TA         TA          Y
## 5          RFn          2        506         TA         TA          Y
## 6          Fin          2        440         TA         TA          Y
##   WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC
## 1        140           0             0          0         120        0   <NA>
## 2        393          36             0          0           0        0   <NA>
## 3        212          34             0          0           0        0   <NA>
## 4        360          36             0          0           0        0   <NA>
## 5          0          82             0          0         144        0   <NA>
## 6        157          84             0          0           0        0   <NA>
##   Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition BsmtLivArea
## 1 MnPrv        <NA>       0      6   2010       WD        Normal         612
## 2  <NA>        Gar2   12500      6   2010       WD        Normal         923
## 3 MnPrv        <NA>       0      3   2010       WD        Normal         791
## 4  <NA>        <NA>       0      6   2010       WD        Normal         602
## 5  <NA>        <NA>       0      1   2010       WD        Normal         263
## 6  <NA>        <NA>       0      4   2010       WD        Normal           0
##   TotBath
## 1       1
## 2       2
## 3       3
## 4       3
## 5       2
## 6       3
# Recode the numeric MSSubClass dwelling code into the DwellType labels that
# the model uses as a predictor. Codes with no mapping below become NA.
# dplyr::mutate() is named explicitly because plyr, attached after dplyr when
# Rmisc was loaded, masks mutate().
test <- test %>%
  dplyr::mutate(
    DwellType = dplyr::case_when(
      MSSubClass == 20 ~ "1StoryNew",
      MSSubClass == 30 ~ "1StoryOld",
      MSSubClass == 40 ~ "1StoryAtt",
      MSSubClass == 45 ~ "1.5StoryUnfin",
      MSSubClass == 50 ~ "1.5StoryFin",
      MSSubClass == 60 ~ "2StoryNew",
      MSSubClass == 70 ~ "2StoryOld",
      MSSubClass == 75 ~ "2.5Story",
      MSSubClass == 80 ~ "MultiLev",
      MSSubClass == 85 ~ "SplitFoy",
      MSSubClass == 90 ~ "Duplex",
      MSSubClass == 120 ~ "1SPUDNew",
      MSSubClass %in% c(150, 180) ~ "2_1.5PUD",
      MSSubClass == 160 ~ "2PUDNew",
      MSSubClass == 190 ~ "2Fam"
    )
  )

# DwellType is a character column, so levels(test$DwellType) is NULL and the
# former union() with model$xlevels left the model unchanged. Check explicitly
# for dwelling types the model was not trained on: they have no coefficient,
# and predict() cannot score them.
unseen_dwell_types <- setdiff(
  unique(test$DwellType[!is.na(test$DwellType)]),
  model$xlevels[["DwellType"]]
)
if (length(unseen_dwell_types) > 0) {
  stop(
    "DwellType values absent from the training data: ",
    paste(unseen_dwell_types, collapse = ", "),
    call. = FALSE
  )
}

# Predicted log sale price for every row of the test set.
pred1 <- predict(model, test)


# Pair each test Id with its predicted log price. data.frame() keeps each
# column's own type (cbind() first coerced both vectors into a single
# matrix); the V1 / pred1 names are the ones that earlier call produced.
kaggle <- data.frame(V1 = test$Id, pred1 = pred1)

# Back-transform the prediction from the log scale to a sale price.
# dplyr::mutate() is named explicitly because plyr, attached after dplyr when
# Rmisc was loaded, masks mutate().
kaggle <- kaggle %>% dplyr::mutate(pred = exp(pred1))

colnames(kaggle)
## [1] "V1"    "pred1" "pred"
kaggle1 <- kaggle[, c("V1", "pred")]

head(kaggle1)
##     V1     pred
## 1 1461 113647.5
## 2 1462 156415.0
## 3 1463 166332.9
## 4 1464 185136.0
## 5 1465 211964.0
## 6 1466 168119.9
# Rename to the Id / SalePrice headers used in the submission file.
colnames(kaggle1) <- c("Id", "SalePrice")

write.csv(
  kaggle1,
  file = "C:/Users/Lisa/Documents/CUNY/605/Final/Kaggle_Submission1.csv",
  quote = FALSE,
  row.names = FALSE
)


#pred1 <- predict(model1, test1)
#test<-test %>% mutate(logSalePrice=log(SalePrice))

Kaggle: Elisa Szydziak score .14419