library(Matrix)
library(MASS)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Descriptive and Inferential Statistics

The two independent variables I will use are GrLivArea (above-ground living area in square feet) and LotFrontage (linear feet of street connected to the property). The dependent variable is SalePrice.

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatter plot matrix for at least two of the independent variables and the dependent variable.

# Load the training data set
training_set = read.csv("https://raw.githubusercontent.com/GitHub-Vlad/Data-Science-Projects/main/House%20Prices%20-%20Advanced%20Regression%20Techniques%20(Kaggle%20competition)/train.csv")

GRLivArea = training_set$GrLivArea
LotFrontage = training_set$LotFrontage
SalePrice =  training_set$SalePrice


#Descriptive statistics for "above ground living area"
summary(GRLivArea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1130    1464    1515    1777    5642
#Histogram for "above ground living area"
hist(GRLivArea, breaks = 10, main = "Above Ground Living Area")

#Descriptive statistics for "Linear feet of street connected to property"
summary(LotFrontage)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   59.00   69.00   70.05   80.00  313.00     259
#Histogram for "Linear feet of street connected to property"
hist(LotFrontage, breaks = 10, main = "Linear feet of street connected to property")

#Descriptive statistics for "Sale Price"
summary(SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
#Histogram for "Sale Price"
hist(SalePrice, breaks = 10, main = "Sale Price")

#Scatter plots of GRLivArea and LotFrontage versus SalePrice
par(mfrow=c(1,2))
plot(GRLivArea, SalePrice , xlab="Above Ground Living Area", ylab="SalePrice", main="Living Area vs SalePrice")
plot(LotFrontage, SalePrice, xlab="Linear feet of street connected to property", ylab="SalePrice", main="Street Connected vs SalePrice")
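
The prompt asks for a scatter plot matrix; the two side-by-side plots above can be complemented with base R's pairs(), which draws every pairwise panel in one figure. A minimal sketch using the same three columns (rows with a missing LotFrontage are simply not drawn in those panels):

# Scatter plot matrix of the two predictors and SalePrice
pairs(training_set[c("GrLivArea", "LotFrontage", "SalePrice")],
      main = "Scatter Plot Matrix")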

Analysis: There is a clear, strong positive relationship between “above ground living area” and sale price: the more above-ground square footage a house has, the higher its price, which is no surprise since larger living space costs more. There is also a positive relationship between “linear feet of street connected to property” and sale price, but it is weaker. For some houses extra street frontage is useful and adds to the price; for others it is marginal and does not factor into the sale price nearly as much as living area does.

Derive a correlation matrix for any three quantitative variables in the dataset.

library(corrplot)
## corrplot 0.92 loaded
GRLivArea = training_set$GrLivArea
LotFrontage = training_set$LotFrontage
SalePrice =  training_set$SalePrice

df = training_set[c("GrLivArea", "LotFrontage", "SalePrice")]
correlation_Matrix= cor(df, use = "complete.obs")
print(correlation_Matrix)
##             GrLivArea LotFrontage SalePrice
## GrLivArea   1.0000000   0.4027974 0.7035566
## LotFrontage 0.4027974   1.0000000 0.3517991
## SalePrice   0.7035566   0.3517991 1.0000000
#Plotting a correlation matrix
corrplot(correlation_Matrix, method = "square")

Analysis: From the correlation matrix, we can conclude that SalePrice has a strong correlation with “Above Ground living area” (about 0.70) and a weak correlation with “Linear feet of street connected to property” (about 0.35). The two independent variables are only moderately correlated with each other (about 0.40).

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

# Welch two-sample t-test on GRLivArea and SalePrice (this compares means; see the cor.test sketch below for a direct test of the correlation)
t.test(GRLivArea , SalePrice, conf.level = 0.80)
## 
##  Welch Two Sample t-test
## 
## data:  GRLivArea and SalePrice
## t = -86.288, df = 1459.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 80 percent confidence interval:
##  -182071.5 -176740.0
## sample estimates:
##  mean of x  mean of y 
##   1515.464 180921.196
# Welch two-sample t-test on LotFrontage and SalePrice (this compares means; see the cor.test sketch below for a direct test of the correlation)
t.test(LotFrontage, SalePrice, conf.level = 0.80)
## 
##  Welch Two Sample t-test
## 
## data:  LotFrontage and SalePrice
## t = -86.985, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 80 percent confidence interval:
##  -183516.8 -178185.5
## sample estimates:
##    mean of x    mean of y 
##     70.04996 180921.19589

Analysis:

For both pairs, “Above Ground living area” vs. SalePrice and “Linear feet of street connected to property” vs. SalePrice, the p-value (< 2.2e-16) is far below the 0.05 significance level, so the null hypothesis is rejected. Strictly speaking, t.test compares the means of the two samples rather than testing whether their correlation is zero; the cor.test sketch below addresses the question as stated. Either way, based on the correlation matrix above (0.70 and 0.35), the correlations between the independent variables and the sale price are clearly not 0. I would be worried about familywise error: we are running multiple hypothesis tests on related variables, which inflates the chance of a false discovery unless the significance level is adjusted for the number of tests.
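
To test the hypothesis that each pairwise correlation is 0 and obtain an 80% confidence interval for the correlation itself, cor.test is the more direct tool. A minimal sketch (cor.test drops incomplete pairs, so the missing LotFrontage values are handled):

# Test H0: correlation = 0, with 80% confidence intervals for the correlation
cor.test(GRLivArea, SalePrice, conf.level = 0.80)
cor.test(LotFrontage, SalePrice, conf.level = 0.80)
cor.test(GRLivArea, LotFrontage, conf.level = 0.80)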

Linear Algebra and Correlation:

Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.)

#inverting the correlation matrix from above
precision_Matrix = solve(correlation_Matrix)
print(precision_Matrix)
##              GrLivArea LotFrontage  SalePrice
## GrLivArea    2.0942930  -0.3711507 -1.3428832
## LotFrontage -0.3711507   1.2070186 -0.1635025
## SalePrice   -1.3428832  -0.1635025  2.0023145
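
The prompt notes that the diagonal of the precision matrix contains variance inflation factors, i.e. 1/(1 - R^2) from regressing each variable on the other two. A minimal sketch checking the first diagonal entry (assuming df and precision_Matrix from the chunks above; complete cases only, to match cor(..., use = "complete.obs")):

# VIF for GrLivArea: regress it on the other two variables and compute 1/(1 - R^2)
df_complete <- na.omit(df)
r2 <- summary(lm(GrLivArea ~ LotFrontage + SalePrice, data = df_complete))$r.squared
1 / (1 - r2)  # should be close to precision_Matrix["GrLivArea", "GrLivArea"]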

Multiply the correlation matrix by the precision matrix

#multiplying the correlation matrix by the precision matrix
round((correlation_Matrix%*% precision_Matrix), 4)
##             GrLivArea LotFrontage SalePrice
## GrLivArea           1           0         0
## LotFrontage         0           1         0
## SalePrice           0           0         1

Multiply the precision matrix by the correlation matrix

#multiplying the precision matrix by the correlation matrix
round((precision_Matrix%*% correlation_Matrix), 4)
##             GrLivArea LotFrontage SalePrice
## GrLivArea           1           0         0
## LotFrontage         0           1         0
## SalePrice           0           0         1

Conduct LU decomposition on the correlation matrix (Calculating the lower triangular matrix)

Lower_correlation_Matrix = lu(correlation_Matrix )
Lower_correlation_Matrix = expand(Lower_correlation_Matrix)
LCM = Lower_correlation_Matrix$L
print(LCM)
## 3 x 3 Matrix of class "dtrMatrix" (unitriangular)
##      [,1]       [,2]       [,3]      
## [1,] 1.00000000          .          .
## [2,] 0.40279741 1.00000000          .
## [3,] 0.70355664 0.08165677 1.00000000

Conduct LU decomposition on the correlation matrix (Calculating the upper triangular matrix)

Upper_correlation_Matrix = lu(correlation_Matrix)
Upper_correlation_Matrix = expand(Upper_correlation_Matrix)
UCM = Upper_correlation_Matrix$U
print(UCM)
## 3 x 3 Matrix of class "dtrMatrix"
##      [,1]      [,2]      [,3]     
## [1,] 1.0000000 0.4027974 0.7035566
## [2,]         . 0.8377542 0.0684083
## [3,]         .         . 0.4994221

Conduct LU decomposition on the precision matrix (Calculating the lower triangular matrix)

Lower_precision_Matrix = lu(precision_Matrix )
Lower_precision_Matrix = expand(Lower_precision_Matrix )
LPM = Lower_precision_Matrix$L
print(LPM)
## 3 x 3 Matrix of class "dtrMatrix" (unitriangular)
##      [,1]       [,2]       [,3]      
## [1,]  1.0000000          .          .
## [2,] -0.1772200  1.0000000          .
## [3,] -0.6412108 -0.3517991  1.0000000

Conduct LU decomposition on the precision matrix (Calculating the upper triangular matrix)

Upper_precision_Matrix = lu(precision_Matrix )
Upper_precision_Matrix = expand(Upper_precision_Matrix )
UPM = Upper_precision_Matrix$U
print(UPM)
## 3 x 3 Matrix of class "dtrMatrix"
##      [,1]       [,2]       [,3]      
## [1,]  2.0942930 -0.3711507 -1.3428832
## [2,]          .  1.1412432 -0.4014883
## [3,]          .          .  1.0000000

Multiply the lower and upper triangular factors of correlation_Matrix to verify that their product reproduces the original matrix

LCM %*% UCM
## 3 x 3 Matrix of class "dgeMatrix"
##           [,1]      [,2]      [,3]
## [1,] 1.0000000 0.4027974 0.7035566
## [2,] 0.4027974 1.0000000 0.3517991
## [3,] 0.7035566 0.3517991 1.0000000

Multiply the lower and upper triangular factors of precision_Matrix to verify that their product reproduces the original matrix

LPM %*% UPM
## 3 x 3 Matrix of class "dgeMatrix"
##            [,1]       [,2]       [,3]
## [1,]  2.0942930 -0.3711507 -1.3428832
## [2,] -0.3711507  1.2070186 -0.1635025
## [3,] -1.3428832 -0.1635025  2.0023145

We can see from the above that A = LU does hold: multiplying the lower triangular factor by the upper triangular factor reproduces the original matrix for both the correlation matrix and the precision matrix.
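
A quick numeric confirmation of the same point (a sketch; as.matrix coerces the Matrix-package objects back to base matrices so they can be compared directly):

# Check that the LU factors reproduce the original matrices
all.equal(as.matrix(LCM %*% UCM), as.matrix(correlation_Matrix), check.attributes = FALSE)
all.equal(as.matrix(LPM %*% UPM), as.matrix(precision_Matrix), check.attributes = FALSE)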

Calculus-Based Probability & Statistics

Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable

# Fit the "Above Ground living Area" variable to an exponential distribution and estimate the rate (λ)
fitdistr(training_set$GrLivArea, "exponential")
##        rate    
##   6.598640e-04 
##  (1.726943e-05)
# Using the fitted rate (λ), generate 1000 draws from the exponential distribution for the sample histogram
sample = rexp(1000, rate = 0.0006598640)

# Plot histograms of the observed "Above Ground living Area" values and the simulated exponential sample side by side
par(mfrow=c(1,2))
hist_observed = hist(training_set$GrLivArea, breaks = 50, xlab = "Observed_Above_Ground_living_Area", main = "Observed")
hist_simulated = hist(sample, breaks = 50,xlab = "Simulated_Above_Ground_living_Area", main = "Sample")

Analysis: Both histograms are right-skewed, but the simulated exponential sample is far more heavily skewed, with its peak at the lowest bin and a long right tail, whereas the observed living-area values are concentrated around 1,000–2,000 square feet.
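
As a sanity check on the fitted rate, the maximum-likelihood estimate of an exponential rate is simply the reciprocal of the sample mean, so λ can be recovered without fitdistr. A minimal sketch:

# The exponential MLE of the rate is 1/mean(x); this should match the fitted rate above
1 / mean(training_set$GrLivArea)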

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF)

quantile(sample, probs = c(0.05, 0.95))
##         5%        95% 
##   72.64699 4119.23318
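
The percentiles above are read off the simulated sample; the question asks for them via the CDF, and the closed-form quantiles of the fitted exponential are available directly from qexp. A sketch using the fitted rate:

# Theoretical 5th and 95th percentiles of the fitted exponential distribution
qexp(c(0.05, 0.95), rate = 0.0006598640)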

Also generate a 95% confidence interval from the empirical data, assuming normality

#Finding the mean of the observed data
mean_observed = mean(training_set$GrLivArea)

#Finding the standard deviation of the observed data
standard_deviation_observed = sd(training_set$GrLivArea)

#Finding the margin of error (critical value times the standard error)
standard_error_observed = qnorm(0.95) * standard_deviation_observed/sqrt(1000)

#Lower bound of the interval
left_interval = mean_observed - standard_error_observed 

#Upper bound of the interval
right_interval = mean_observed + standard_error_observed 

print(paste0('The 95% confidence interval for the empirical data lies between ', left_interval ,' and ', right_interval))
## [1] "The 95% confidence interval for the empirical data lies between 1488.13092120036 and 1542.79647605992"
#Visualizing a normal distribution with the observed mean and standard deviation to illustrate the normality assumption
hist(rnorm(length(training_set$GrLivArea), mean_observed, standard_deviation_observed),xlab = "Observed_Above_Ground_living_Area", main = "Observed Normality")
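
Note that qnorm(0.95) corresponds to a two-sided 90% interval and that the training set has 1,460 observations rather than 1,000, so the interval printed above is narrower than a textbook 95% CI. A minimal sketch of the more conventional calculation:

# Conventional two-sided 95% confidence interval for the mean of GrLivArea
n  <- length(training_set$GrLivArea)
se <- sd(training_set$GrLivArea) / sqrt(n)
mean(training_set$GrLivArea) + c(-1, 1) * qnorm(0.975) * se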

Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

quantile(training_set$GrLivArea, probs = c(0.05, 0.95))
##     5%    95% 
##  848.0 2466.1

We can see from the calculation above that the lowest 5% of observations fall below 848 square feet and the highest 5% fall above 2,466.1 square feet, so 90% of the observations lie between these two values. This empirical range is much narrower than the 5th–95th percentile range of the simulated exponential sample (roughly 73 to 4,119), which again suggests the exponential distribution is a poor fit for this variable.

Modeling

Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score. Provide a screen snapshot of your score with your name identifiable.

Create a multiple linear regression model for the data set and retrieve the summary

#select only numeric columns from training set
training_set %>% select_if(is.numeric)->training_set
lm_sale_price <- lm(SalePrice ~ ., data = training_set)
summary(lm_sale_price)
## 
## Call:
## lm(formula = SalePrice ~ ., data = training_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -442182  -16955   -2824   15125  318183 
## 
## Coefficients: (2 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -3.351e+05  1.701e+06  -0.197 0.843909    
## Id            -1.205e+00  2.658e+00  -0.453 0.650332    
## MSSubClass    -2.001e+02  3.451e+01  -5.797 8.84e-09 ***
## LotFrontage   -1.160e+02  6.126e+01  -1.894 0.058503 .  
## LotArea        5.422e-01  1.575e-01   3.442 0.000599 ***
## OverallQual    1.866e+04  1.482e+03  12.592  < 2e-16 ***
## OverallCond    5.239e+03  1.368e+03   3.830 0.000135 ***
## YearBuilt      3.164e+02  8.766e+01   3.610 0.000321 ***
## YearRemodAdd   1.194e+02  8.668e+01   1.378 0.168607    
## MasVnrArea     3.141e+01  7.022e+00   4.473 8.54e-06 ***
## BsmtFinSF1     1.736e+01  5.838e+00   2.973 0.003014 ** 
## BsmtFinSF2     8.342e+00  8.766e+00   0.952 0.341532    
## BsmtUnfSF      5.005e+00  5.277e+00   0.948 0.343173    
## TotalBsmtSF           NA         NA      NA       NA    
## X1stFlrSF      4.597e+01  7.360e+00   6.246 6.02e-10 ***
## X2ndFlrSF      4.663e+01  6.102e+00   7.641 4.72e-14 ***
## LowQualFinSF   3.341e+01  2.794e+01   1.196 0.232009    
## GrLivArea             NA         NA      NA       NA    
## BsmtFullBath   9.043e+03  3.198e+03   2.828 0.004776 ** 
## BsmtHalfBath   2.465e+03  5.073e+03   0.486 0.627135    
## FullBath       5.433e+03  3.531e+03   1.539 0.124182    
## HalfBath      -1.098e+03  3.321e+03  -0.331 0.740945    
## BedroomAbvGr  -1.022e+04  2.155e+03  -4.742 2.40e-06 ***
## KitchenAbvGr  -2.202e+04  6.710e+03  -3.282 0.001063 ** 
## TotRmsAbvGrd   5.464e+03  1.487e+03   3.674 0.000251 ***
## Fireplaces     4.372e+03  2.189e+03   1.998 0.046020 *  
## GarageYrBlt   -4.728e+01  9.106e+01  -0.519 0.603742    
## GarageCars     1.685e+04  3.491e+03   4.827 1.58e-06 ***
## GarageArea     6.274e+00  1.213e+01   0.517 0.605002    
## WoodDeckSF     2.144e+01  1.002e+01   2.139 0.032662 *  
## OpenPorchSF   -2.252e+00  1.949e+01  -0.116 0.907998    
## EnclosedPorch  7.295e+00  2.062e+01   0.354 0.723590    
## X3SsnPorch     3.349e+01  3.758e+01   0.891 0.373163    
## ScreenPorch    5.805e+01  2.041e+01   2.844 0.004532 ** 
## PoolArea      -6.052e+01  2.990e+01  -2.024 0.043204 *  
## MiscVal       -3.761e+00  6.960e+00  -0.540 0.589016    
## MoSold        -2.217e+02  4.229e+02  -0.524 0.600188    
## YrSold        -2.474e+02  8.458e+02  -0.293 0.769917    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36800 on 1085 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.8096, Adjusted R-squared:  0.8034 
## F-statistic: 131.8 on 35 and 1085 DF,  p-value: < 2.2e-16

Looking at the summary of the multiple regression model built above, certain adjustments need to be made so that it only contains independent variables that are relevant for predicting the dependent variable (SalePrice). Two things stand out. First, missing values are a problem: 339 observations were dropped from the fit above because of NAs, and two coefficients were not defined because of singularities. Second, independent variables whose p-values are greater than 0.05 should be removed: a predictor with a p-value above 0.05 is statistically insignificant, so it tells us little about the dependent variable and reduces the usefulness of the model.
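
A minimal sketch of pulling those predictors out programmatically (using the full model object lm_sale_price fitted above; coefficients dropped for singularities do not appear in the summary table):

# Predictors in the full model with p-values below 0.05
coefs <- summary(lm_sale_price)$coefficients
rownames(coefs)[coefs[, "Pr(>|t|)"] < 0.05]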

Create and plot a multiple linear regression model for the adjusted predictor set and retrieve the summary

lm_sale_price_adjusted <- lm(SalePrice ~ MSSubClass + LotArea + OverallQual +
                      OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + 
                      BsmtFinSF1 + X1stFlrSF + X2ndFlrSF + BsmtFullBath + 
                      BedroomAbvGr + KitchenAbvGr +
                      TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF, 
                    data = training_set)

summary(lm_sale_price_adjusted)
## 
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual + 
##     OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + 
##     X1stFlrSF + X2ndFlrSF + BsmtFullBath + BedroomAbvGr + KitchenAbvGr + 
##     TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF, data = training_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -487250  -16180   -2016   13330  285692 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.043e+06  1.157e+05  -9.016  < 2e-16 ***
## MSSubClass   -1.651e+02  2.583e+01  -6.393 2.20e-10 ***
## LotArea       4.223e-01  1.002e-01   4.215 2.66e-05 ***
## OverallQual   1.821e+04  1.139e+03  15.977  < 2e-16 ***
## OverallCond   4.017e+03  1.001e+03   4.012 6.32e-05 ***
## YearBuilt     3.062e+02  5.184e+01   5.907 4.34e-09 ***
## YearRemodAdd  1.929e+02  6.482e+01   2.976 0.002973 ** 
## MasVnrArea    3.232e+01  5.892e+00   5.486 4.87e-08 ***
## BsmtFinSF1    1.080e+01  2.983e+00   3.620 0.000304 ***
## X1stFlrSF     5.593e+01  4.632e+00  12.075  < 2e-16 ***
## X2ndFlrSF     4.715e+01  4.100e+00  11.501  < 2e-16 ***
## BsmtFullBath  8.545e+03  2.379e+03   3.591 0.000340 ***
## BedroomAbvGr -9.399e+03  1.665e+03  -5.645 1.99e-08 ***
## KitchenAbvGr -1.339e+04  5.097e+03  -2.627 0.008694 ** 
## TotRmsAbvGrd  5.166e+03  1.214e+03   4.255 2.22e-05 ***
## Fireplaces    3.938e+03  1.732e+03   2.273 0.023147 *  
## GarageCars    1.058e+04  1.693e+03   6.253 5.31e-10 ***
## WoodDeckSF    2.099e+01  7.857e+00   2.672 0.007630 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34830 on 1434 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.8093, Adjusted R-squared:  0.807 
## F-statistic:   358 on 17 and 1434 DF,  p-value: < 2.2e-16
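
Before moving to prediction, a quick residual check on the adjusted model is a common sanity step; a minimal sketch using base R's built-in diagnostics:

# Standard residual diagnostic plots (residuals vs fitted, Q-Q, scale-location, leverage)
par(mfrow = c(2, 2))
plot(lm_sale_price_adjusted)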

Load the test.csv data set from Kaggle and run the adjusted multiple linear regression model to predict SalePrice

# Load the test dataset
testing_set <- read.csv("https://raw.githubusercontent.com/GitHub-Vlad/Data-Science-Projects/main/House%20Prices%20-%20Advanced%20Regression%20Techniques%20(Kaggle%20competition)/test.csv")

#Replace all N/A values with 0
testing_set[is.na(testing_set)] = 0
 
#run the adjusted multiple regression model created above on the test data set to retrieve the values of the predicted variable (SalePrice)
prediction <- predict(lm_sale_price_adjusted, testing_set)

#combine the test-set Ids with the predicted values into one data frame
prediction <- data.frame(cbind(testing_set$Id, prediction))

#Name the columns "Id" and "SalePrice"
colnames(prediction) = c('Id', 'SalePrice')

#Export results to csv
#write.csv(prediction, "house_price_predictions.csv")
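
Note that write.csv includes row names by default, while the Kaggle submission format expects only the Id and SalePrice columns, so the extra column can cause a rejected submission. A minimal sketch of the export call:

# Export the predictions without R's default row-name column
write.csv(prediction, "house_price_predictions.csv", row.names = FALSE)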