library(tidyverse)
library(GGally)
library(confintr)
library(MASS)
library(matlib)
library(matrixcalc)
library(Hmisc)
train <- as.data.frame(read.csv('hp_train.csv'))

Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set.

#Remove Columns with NA values
train <- train[,colSums(is.na(train)) == 0]
head(train[,2:ncol(train)])
##   MSSubClass MSZoning LotArea Street LotShape LandContour Utilities LotConfig
## 1         60       RL    8450   Pave      Reg         Lvl    AllPub    Inside
## 2         20       RL    9600   Pave      Reg         Lvl    AllPub       FR2
## 3         60       RL   11250   Pave      IR1         Lvl    AllPub    Inside
## 4         70       RL    9550   Pave      IR1         Lvl    AllPub    Corner
## 5         60       RL   14260   Pave      IR1         Lvl    AllPub       FR2
## 6         50       RL   14115   Pave      IR1         Lvl    AllPub    Inside
##   LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual
## 1       Gtl      CollgCr       Norm       Norm     1Fam     2Story           7
## 2       Gtl      Veenker      Feedr       Norm     1Fam     1Story           6
## 3       Gtl      CollgCr       Norm       Norm     1Fam     2Story           7
## 4       Gtl      Crawfor       Norm       Norm     1Fam     2Story           7
## 5       Gtl      NoRidge       Norm       Norm     1Fam     2Story           8
## 6       Gtl      Mitchel       Norm       Norm     1Fam     1.5Fin           5
##   OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## 1           5      2003         2003     Gable  CompShg     VinylSd     VinylSd
## 2           8      1976         1976     Gable  CompShg     MetalSd     MetalSd
## 3           5      2001         2002     Gable  CompShg     VinylSd     VinylSd
## 4           5      1915         1970     Gable  CompShg     Wd Sdng     Wd Shng
## 5           5      2000         2000     Gable  CompShg     VinylSd     VinylSd
## 6           5      1993         1995     Gable  CompShg     VinylSd     VinylSd
##   ExterQual ExterCond Foundation BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1        Gd        TA      PConc        706          0       150         856
## 2        TA        TA     CBlock        978          0       284        1262
## 3        Gd        TA      PConc        486          0       434         920
## 4        TA        TA     BrkTil        216          0       540         756
## 5        Gd        TA      PConc        655          0       490        1145
## 6        TA        TA       Wood        732          0        64         796
##   Heating HeatingQC CentralAir X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea
## 1    GasA        Ex          Y       856       854            0      1710
## 2    GasA        Ex          Y      1262         0            0      1262
## 3    GasA        Ex          Y       920       866            0      1786
## 4    GasA        Gd          Y       961       756            0      1717
## 5    GasA        Ex          Y      1145      1053            0      2198
## 6    GasA        Ex          Y       796       566            0      1362
##   BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr
## 1            1            0        2        1            3            1
## 2            0            1        2        0            3            1
## 3            1            0        2        1            3            1
## 4            1            0        1        0            3            1
## 5            1            0        2        1            4            1
## 6            1            0        1        1            1            1
##   KitchenQual TotRmsAbvGrd Functional Fireplaces GarageCars GarageArea
## 1          Gd            8        Typ          0          2        548
## 2          TA            6        Typ          1          2        460
## 3          Gd            6        Typ          1          2        608
## 4          Gd            7        Typ          1          3        642
## 5          Gd            9        Typ          1          3        836
## 6          TA            5        Typ          0          2        480
##   PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
## 1          Y          0          61             0          0           0
## 2          Y        298           0             0          0           0
## 3          Y          0          42             0          0           0
## 4          Y          0          35           272          0           0
## 5          Y        192          84             0          0           0
## 6          Y         40          30             0        320           0
##   PoolArea MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1        0       0      2   2008       WD        Normal    208500
## 2        0       0      5   2007       WD        Normal    181500
## 3        0       0      9   2008       WD        Normal    223500
## 4        0       0      2   2006       WD       Abnorml    140000
## 5        0       0     12   2008       WD        Normal    250000
## 6        0     700     10   2009       WD        Normal    143000
# Data type of variables
type <- data.frame(Type = table(sapply(train, class)))

ggplot(type, aes(x = Type.Var1, y = Type.Freq)) +
    geom_bar(stat="identity", width = 0.25, color = 'black', fill='seagreen') + 
    labs(x = 'Type', y = 'Count')

sp <- ggplot(train,aes(SalePrice)) + geom_histogram(aes(y =..density..),bins = 50, fill = 'LightGreen', color = 'Black') + geom_density()
sp

Provide a scatterplot matrix for at least two of the independent variables and the dependent variable.

sub <- train %>% dplyr::select(SalePrice, PoolArea, LotArea)
ggpairs(sub)

Derive a correlation matrix for any three quantitative variables in the dataset.

sub2 <- train %>% dplyr::select(SalePrice, LotArea, OverallCond)
cm <- cor(sub2)
cm
##               SalePrice     LotArea OverallCond
## SalePrice    1.00000000  0.26384335 -0.07785589
## LotArea      0.26384335  1.00000000 -0.00563627
## OverallCond -0.07785589 -0.00563627  1.00000000

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not? 5 points

sl <- cm[1,][2]
so <- cm[1,][3]
lo <- cm[2,][3]
n <- nrow(train)
ci <- c(0.1,0.9)

ci_1 <- ci_cor(sub2 %>% dplyr::select(c(1,2)), ci)
ci_2 <- ci_cor(sub2 %>% dplyr::select(c(1,3)), ci)
ci_3 <- ci_cor(sub2 %>% dplyr::select(c(2,3)), ci)
print('Sale Price v. Lot Area: ')
## [1] "Sale Price v. Lot Area: "
print(ci_1$interval)
## [1] 0.2154574 0.3109369
print('Sale Price v. Overall Condition: ')
## [1] "Sale Price v. Overall Condition: "
print(ci_2$interval)
## [1] -0.12864437 -0.02666008
print('Lot Area v. Overall Condition: ')
## [1] "Lot Area v. Overall Condition: "
print(ci_3$interval)
## [1] -0.05692212  0.04567924

Linear Algebra and Correlation. Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix. 5 points

lu_factor <- function(x)
{
  counter = 2
  pos = 1
  identity = seq(1,nrow(x)*ncol(x))
  for(r in seq(nrow(x)))
  {
    for(c in seq(ncol(x)))
    {
      if(r==c)
        {identity[pos] = 1}
      else
      {identity[pos] = 0}
      pos = pos + 1
    }
  }
  
  identity = as.matrix(matrix(identity, nrow = nrow(x), ncol=ncol(x), byrow = TRUE))
  
  for (c in seq(1,ncol(x)-1)) {
    
    for( r in seq(counter, nrow(x)))
    {
      factor = -(x[r,c]/x[counter-1,c])
      new_r = factor*x[counter-1,] 
      x[r,] = new_r + x[r,]
      
      identity[r,c] = -1*factor
    }
    counter = counter + 1
  }
  
  list(x, identity)
}
# Invert your correlation matrix from above.
pm <- inv(cm)
temp <- cm %*% pm
round(temp,1)
##                  
## SalePrice   1 0 0
## LotArea     0 1 0
## OverallCond 0 0 1
m <- temp %*% cm
m
##               SalePrice      LotArea  OverallCond
## SalePrice    1.00000000  0.263843349 -0.077855893
## LotArea      0.26384335  0.999999999 -0.005636267
## OverallCond -0.07785589 -0.005636267  0.999999997
#Conduct LU decomposition on the matrix
print(lu_factor(m))
## [[1]]
##             SalePrice   LotArea OverallCond
## SalePrice           1 0.2638433 -0.07785589
## LotArea             0 0.9303867  0.01490549
## OverallCond         0 0.0000000  0.99369966
## 
## [[2]]
##             [,1]       [,2] [,3]
## [1,]  1.00000000 0.00000000    0
## [2,]  0.26384335 1.00000000    0
## [3,] -0.07785589 0.01602075    1
lu.decomposition(m)
## $L
##             [,1]       [,2] [,3]
## [1,]  1.00000000 0.00000000    0
## [2,]  0.26384335 1.00000000    0
## [3,] -0.07785589 0.01602075    1
## 
## $U
##      [,1]      [,2]        [,3]
## [1,]    1 0.2638433 -0.07785589
## [2,]    0 0.9303867  0.01490549
## [3,]    0 0.0000000  0.99369966

Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary.

train_num <- select_if(train, is.numeric)
hist.data.frame(train_num)

ggplot(train_num, aes(BsmtFinSF1)) + geom_histogram(bins = 50, color = 'black', fill = 'lightpink')

Then load the MASS package and run fitdistr to fit an exponential probability density function. (See fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/Rdevel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable.

data <- train_num$BsmtFinSF1
fit <- fitdistr(data,"exponential")
lambda <- fit$estimate
samples <- rexp(1000, lambda)
samples <- as.data.frame(samples)
ggplot(samples, aes(samples)) + geom_histogram(bins = 50, color = 'black', fill = 'lightblue')

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).

qexp(0.05, lambda)
## [1] 22.75574
qexp(0.95, lambda)
## [1] 1329.026
qexp(c(0.05,0.95), lambda)
## [1]   22.75574 1329.02585

Also generate a 95% confidence interval from the empirical data, assuming normality.

m <- mean(data)
sd <- sd(data)
n <- nrow(train_num)
se <- qnorm(0.975)*(sd/sqrt(n))
c(m-se,m+se)
## [1] 420.2444 467.0351

Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss. 10 points

quantile(train_num$GrLivArea, probs = c(0.05,0.95))
##     5%    95% 
##  848.0 2466.1

Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com username and score. 10 points

test <- as.data.frame(read.csv('hp_test.csv'))
unique(is.na(test$LotArea))
## [1] FALSE
mrm <- lm(SalePrice ~ LotArea + YearBuilt + HouseStyle + OverallCond + Neighborhood, data = train )
mrm
## 
## Call:
## lm(formula = SalePrice ~ LotArea + YearBuilt + HouseStyle + OverallCond + 
##     Neighborhood, data = train)
## 
## Coefficients:
##         (Intercept)              LotArea            YearBuilt  
##          -1.997e+06            1.566e+00            1.077e+03  
##    HouseStyle1.5Unf     HouseStyle1Story     HouseStyle2.5Fin  
##          -2.470e+04           -1.595e+04            8.696e+04  
##    HouseStyle2.5Unf     HouseStyle2Story     HouseStyleSFoyer  
##           3.883e+04            6.567e+03           -2.754e+04  
##      HouseStyleSLvl          OverallCond  NeighborhoodBlueste  
##          -1.218e+04            8.641e+03           -5.858e+04  
##  NeighborhoodBrDale  NeighborhoodBrkSide  NeighborhoodClearCr  
##          -7.778e+04           -1.584e+04            1.871e+02  
## NeighborhoodCollgCr  NeighborhoodCrawfor  NeighborhoodEdwards  
##          -8.605e+03            4.287e+04           -3.439e+04  
## NeighborhoodGilbert   NeighborhoodIDOTRR  NeighborhoodMeadowV  
##          -2.572e+04           -3.452e+04           -7.076e+04  
## NeighborhoodMitchel    NeighborhoodNAmes  NeighborhoodNoRidge  
##          -3.125e+04           -2.093e+04            1.132e+05  
## NeighborhoodNPkVill  NeighborhoodNridgHt   NeighborhoodNWAmes  
##          -3.828e+04            1.016e+05           -4.682e+03  
## NeighborhoodOldTown   NeighborhoodSawyer  NeighborhoodSawyerW  
##          -1.448e+04           -3.330e+04           -1.103e+04  
## NeighborhoodSomerst  NeighborhoodStoneBr    NeighborhoodSWISU  
##           1.069e+04            1.052e+05           -1.192e+04  
##  NeighborhoodTimber  NeighborhoodVeenker  
##           2.128e+04            3.569e+04
predict <- predict(mrm, test)
predict_csv <- data.frame(Id = test$Id, SalePrice = predict)
head(predict_csv)
##     Id SalePrice
## 1 1461  147826.1
## 2 1462  148737.3
## 3 1463  199128.1
## 4 1464  202814.8
## 5 1465  288344.6
## 6 1466  188823.9
write.csv(predict_csv, 'Submission.csv',row.names = FALSE)

Kaggle Submission:

Username: KrutikaP Score: 0.22993