Load all the necessary libraries

library(gridExtra)
library(RColorBrewer)
library(Matrix)
library(scales)
library(corrplot)
library(MASS)
library(psych)
library(ggplot2)
library(DataExplorer)
library(ggpmisc)

The following R Markdown document is the final exam for DATA 605.

Problem 1

Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean and standard deviation of:

\[\mu=\sigma=(N+1)/2\]

set.seed(123)

# N must be >= 6; n is the number of random draws
N = 10
n = 10000

# Random Uniform number generation
X = runif(n, 1, N)

# Random normal number generation
Y = rnorm(n, mean = (N+1)/2, sd = (N+1)/2)

df = data.frame(cbind(X, Y))


plot_histogram(df, title = "Uniform and Normal Distributions generated")


Probability

Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable.

The median of X is calculated as:

# Median of X
(x = median(X))
## [1] 5.451109

The 25th percentile (1st quartile) of Y is calculated as:

# 1st quartile of Y
(y = quantile(Y, 0.25))
##      25% 
## 1.840529

Calculate as a minimum the below probabilities a through c. Interpret the meaning of all probabilities.

a. P(X>x | X>y)

# Number of observations where X is greater than y, the 1st quartile of Y
(nXy = nrow(subset(df, X > y)))
## [1] 9075
# P(X > x | X > y): probability that X exceeds its median x, given that X exceeds y, the 1st quartile of Y
(pa = nrow(subset(df, X > x & X > y))/nXy)
## [1] 0.5509642

The probability is approximately 0.551: given that X exceeds y (the 1st quartile of Y), there is about a 55% chance that X also exceeds its own median x.

b. P(X>x, Y>y)

# P(X > x, Y > y): probability that X exceeds its median x and Y exceeds its 1st quartile y
(pb = nrow(subset(df, X > x & Y > y))/n)
## [1] 0.3756

The probability is approximately 0.376: about 37.6% of the observations have X above its median and Y above its 1st quartile.

c. P(X<x | X>y)

# P(X < x | X > y): probability that X is less than its median x, given that X is greater than y, the 1st quartile of Y
(pc = nrow(subset(df, X < x & X > y))/nXy)
## [1] 0.4490358

The probability is approximately 0.449, the complement of (a): given that X exceeds y, there is about a 45% chance that X falls below its own median x.
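
The same three probabilities can also be computed compactly with logical vectors and mean(); this is only a cross-check of the subset() counts above and should return the same values.

# (a) P(X > x | X > y), (b) P(X > x, Y > y), (c) P(X < x | X > y)
mean(X > x & X > y) / mean(X > y)
mean(X > x & Y > y)
mean(X < x & X > y) / mean(X > y)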


Investigate whether P(X>x and Y>y) = P(X > x) * P(Y > y) by building a table and evaluating the marginal and joint probabilities.

# A1: counts with Y <= y (the 1st quartile of Y), split by X <= x and X > x
# B1: counts with Y > y, split the same way
A1 <- c(sum(X <= x & Y <= y), sum(X > x & Y <= y))
B1 <- c(sum(X <= x & Y > y), sum(X > x & Y > y))

matrix1 <- matrix(c(A1, B1), nrow = 2)
matrix1 <- rbind(matrix1, apply(matrix1, 2, sum))
matrix1 <- cbind(matrix1, apply(matrix1, 1, sum))

xy <- c("X <= median", "X > median", "Total")
countDF <- data.frame(xy, matrix1)
colnames(countDF) <- c("X / Y", "Y <= 1st quartile", "Y > 1st quartile", "Total")
print(countDF)
##         X / Y Y <= 1st quartile Y > 1st quartile Total
## 1 X <= median              1256             3744  5000
## 2  X > median              1244             3756  5000
## 3       Total              2500             7500 10000
# A: count of X > x; B: count of Y > y; A_B: joint count of X > x and Y > y
A <- countDF[2, 4]
B <- countDF[3, 3]
A_B <- countDF[2, 3]
tot <- countDF[3, 4]

Prob_A <- A/tot
Prob_B <- B/tot
prob_A_B <- A_B/tot

print(prob_A_B)
## [1] 0.3756

So P(X > x and Y > y) = 0.3756.

Prob_A_Prob_B = Prob_A * Prob_B

print(Prob_A_Prob_B)
## [1] 0.375

Since 0.3756 is approximately 0.3750, P(X > x and Y > y) is essentially equal to P(X > x) * P(Y > y); the small difference is attributable to sampling variability, which is consistent with X and Y being independent.
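
For reference, base R can build the same contingency table (with margins) much more compactly; this sketch uses table() and addmargins() on the logical conditions defined above and should reproduce the counts shown.

# Rows: X relative to its median x; columns: Y relative to the 1st quartile y of Y
addmargins(table(X > x, Y > y))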


Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test.
What is the difference between the two? Which is most appropriate?

# Generate the matrix
M <- matrix(c(A1, B1), nrow = 2)

# Perform the Chi square test
chisq.test(M)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  M
## X-squared = 0.064533, df = 1, p-value = 0.7995
# Perform the Fisher's Exact Test
fisher.test(M, simulate.p.value=TRUE)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  M
## p-value = 0.7995
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.9242273 1.1100187
## sample estimates:
## odds ratio 
##   1.012883

Both tests give a p-value of about 0.80, so we fail to reject the null hypothesis that the two groupings are independent. The Chi-Square Test relies on a large-sample approximation to the sampling distribution, while Fisher's Exact Test computes an exact p-value conditional on the table margins and is generally preferred when samples or expected cell counts are small. With 10,000 observations and large expected counts in every cell, the Chi-Square Test is entirely appropriate here, and both tests lead to the same conclusion.

Problem 2

You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques .
I want you to do the following.

The variables selected for the regression analysis are LotArea, SalePrice, and TotalBsmtSF.

#Training set
dftrain = read.csv("train.csv")

DataExplorer::introduce(dftrain)
##   rows columns discrete_columns continuous_columns all_missing_columns
## 1 1460      81               43                 38                   0
##   total_missing_values complete_rows total_observations memory_usage
## 1                 6965             0             118260       755280
DataExplorer::plot_str(dftrain)

# Select the variables to perform the various statistical operations
X = dftrain$LotArea
Y = dftrain$SalePrice
Z = dftrain$TotalBsmtSF

Descriptive and Inferential Statistics.

Provide univariate descriptive statistics and appropriate plots for the training data set.

Descriptive statistics on LotArea

# Descriptive statistics on LotArea
summary(X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1300    7554    9478   10517   11602  215245
describe(X)
##    vars    n     mean      sd median trimmed     mad  min    max  range  skew
## X1    1 1460 10516.83 9981.26 9478.5 9563.28 2962.23 1300 215245 213945 12.18
##    kurtosis     se
## X1   202.26 261.22
# BoxPlot and Histograms for the LotArea
par(mfrow=c(1,2))
boxplot(X, main="LotArea BoxPlot")
hist(X, breaks = 20, main = "LotArea Histogram")


Descriptive statistics on SalePrice

# Descriptive statistics on SalePrice
summary(Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
describe(Y)
##    vars    n     mean      sd median  trimmed     mad   min    max  range skew
## X1    1 1460 180921.2 79442.5 163000 170783.3 56338.8 34900 755000 720100 1.88
##    kurtosis      se
## X1      6.5 2079.11
# BoxPlot and Histograms for the SalePrice
par(mfrow=c(1,2))
boxplot(Y, main="SalePrice BoxPlot")
hist(Y, breaks = 30, main = "SalePrice Histogram")


Descriptive statistics on TotalBsmtSF

# Descriptive statistics on TotalBsmtSF
summary(Z)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   795.8   991.5  1057.4  1298.2  6110.0
describe(Z)
##    vars    n    mean     sd median trimmed    mad min  max range skew kurtosis
## X1    1 1460 1057.43 438.71  991.5  1036.7 347.67   0 6110  6110 1.52    13.18
##       se
## X1 11.48

Boxplot and histogram for TotalBsmtSF

# Boxplot and histogram for TotalBsmtSF
par(mfrow=c(1,2))
boxplot(Z, main="TotalBsmtSF BoxPlot")
hist(Z, breaks = 30, main = "TotalBsmtSF Histogram")


Provide a scatterplot matrix for at least two of the independent variables and the dependent variable.

# Scatterplots of LotArea and TotalBsmtSF vs SalePrice
par(mfrow=c(1,2))
ggplot(data.frame(X,Y,Z), aes( x = X, y = Y))+
  geom_point()+theme_bw()+
  geom_smooth(method = "lm")+
  labs(x = "LotArea",
       y = "SalePrice")+
    stat_poly_eq(aes(label = paste(..adj.rr.label.., sep = "~~~")), 
                   label.x.npc = "right", label.y.npc = .9,
                   formula = y~x, parse = TRUE, size = 3.5)

ggplot(data.frame(X,Y,Z), aes( x = Z, y = Y))+
  geom_point()+theme_bw()+
  geom_smooth(method = "lm")+
  labs(x = "basementSqft",
       y = "SalePrice")+
      stat_poly_eq(aes(label = paste(..adj.rr.label.., sep = "~~~")), 
                   label.x.npc = "left", label.y.npc = .9,
                   formula = y~x, parse = TRUE, size = 3.5)

From the scatterplots there appears to be a fairly strong positive association between basement square footage and SalePrice, while the association between LotArea and SalePrice is weak; both plots also show clear outliers.


Derive a correlation matrix for any three quantitative variables in the dataset.

dfsub<- data.frame(LotArea = X, SalePrice = Y, BsmntSqFt=Z )

corMatrix = cor(dfsub, use = "complete.obs")

DataExplorer::plot_correlation(dfsub, ggtheme = theme_bw(),cor_args = list("use" = "pairwise.complete.obs"))

From the correlation matrix we can conclude that the three variables range from weakly to moderately correlated: SalePrice has a moderate correlation with TotalBsmtSF (about 0.61) and a weak correlation with LotArea (about 0.26), and LotArea and TotalBsmtSF are likewise weakly correlated with each other (about 0.26).


Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval.

# Pairwise correlation between LotArea and SalePrice
cor.test(dfsub$LotArea, dfsub$SalePrice, method = 'pearson', conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  dfsub$LotArea and dfsub$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2323391 0.2947946
## sample estimates:
##       cor 
## 0.2638434
# Pairwise correlation between TotalBsmtSF and SalePrice
cor.test(dfsub$BsmntSqFt, dfsub$SalePrice, method = 'pearson', conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  dfsub$BsmntSqFt and dfsub$SalePrice
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5922142 0.6340846
## sample estimates:
##       cor 
## 0.6135806
# Pairwise correlation between LotArea and TotalBsmtSF
cor.test(dfsub$LotArea, dfsub$BsmntSqFt, method = 'pearson', conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  dfsub$LotArea and dfsub$BsmntSqFt
## t = 10.317, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2292786 0.2918400
## sample estimates:
##       cor 
## 0.2608331

Discuss the meaning of your analysis.

The pairwise correlation tests using the Pearson method estimate the association between each pair of variables and test the null hypothesis that the true correlation is zero. All of the p-values are far below any conventional significance level (including alpha = 0.20, which corresponds to the 80% confidence intervals), so we reject the null hypothesis for every pair and conclude that each pair of variables is correlated, with the correlation coefficients shown above.

Would you be worried about familywise error? Why or why not?

Yes, I would be worried about familywise error. Three pairwise hypothesis tests are performed here, and with every additional test the probability of rejecting at least one true null hypothesis (a Type I error) increases, so a multiple-comparison correction is worth considering. In this case the p-values are so small that the conclusions would not change.
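
Because three pairwise tests are run, a multiple-comparison correction can be applied; the sketch below (using the p-values from the cor.test calls above, all reported as < 2.2e-16) shows a Bonferroni adjustment with p.adjust.

# Bonferroni adjustment of the three pairwise correlation p-values
pvals <- c(
  LotArea_SalePrice = cor.test(dfsub$LotArea,   dfsub$SalePrice)$p.value,
  Bsmnt_SalePrice   = cor.test(dfsub$BsmntSqFt, dfsub$SalePrice)$p.value,
  LotArea_Bsmnt     = cor.test(dfsub$LotArea,   dfsub$BsmntSqFt)$p.value
)
p.adjust(pvals, method = "bonferroni")  # still effectively zero, so the conclusions are unchanged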


Linear Algebra and Correlation.

Invert your correlation matrix from above. This is known as the precision matrix and contains variance inflation factors on the diagonal.

# Correlation matrix from above
print(corMatrix)
##             LotArea SalePrice BsmntSqFt
## LotArea   1.0000000 0.2638434 0.2608331
## SalePrice 0.2638434 1.0000000 0.6135806
## BsmntSqFt 0.2608331 0.6135806 1.0000000
# Invert the correlation matrix with solve() to obtain the precision matrix
precision_matrix = solve(corMatrix)
print(precision_matrix)
##              LotArea  SalePrice  BsmntSqFt
## LotArea    1.0932718 -0.1820040 -0.1734874
## SalePrice -0.1820040  1.6341000 -0.9551793
## BsmntSqFt -0.1734874 -0.9551793  1.6313307
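
As a sanity check on the claim that the diagonal holds variance inflation factors, each diagonal entry should equal 1 / (1 - R^2) from regressing that variable on the other two. A sketch for SalePrice, using dfsub defined earlier:

# VIF for SalePrice: should be close to precision_matrix["SalePrice", "SalePrice"] (about 1.63)
r2 <- summary(lm(SalePrice ~ LotArea + BsmntSqFt, data = dfsub))$r.squared
1 / (1 - r2)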

Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.

# To multiply the correlation matrix by the precision matrix
round((corMatrix %*% precision_matrix), 4)
##           LotArea SalePrice BsmntSqFt
## LotArea         1         0         0
## SalePrice       0         1         0
## BsmntSqFt       0         0         1
# Multiply precision matrix by the correlation matrix
round((precision_matrix %*% corMatrix), 4)
##           LotArea SalePrice BsmntSqFt
## LotArea         1         0         0
## SalePrice       0         1         0
## BsmntSqFt       0         0         1

Both products are (up to rounding) the 3 x 3 identity matrix, as expected, since the precision matrix is the inverse of the correlation matrix.
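
A quick numerical confirmation (optional):

# Both products should equal the 3 x 3 identity up to floating-point error
all.equal(unname(corMatrix %*% precision_matrix), diag(3))
all.equal(unname(precision_matrix %*% corMatrix), diag(3))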


Conduct LU decomposition on the matrix.

Correlation Matrix

# LU decomposition of the correlation matrix
lud_cor = lu(corMatrix)
elu_cor = expand(lud_cor)

#Lower triangular matrix
cor_L = elu_cor$L
print(cor_L)
## 3 x 3 Matrix of class "dtrMatrix" (unitriangular)
##      [,1]      [,2]      [,3]     
## [1,] 1.0000000         .         .
## [2,] 0.2638434 1.0000000         .
## [3,] 0.2608331 0.5855216 1.0000000
#Upper triangular matrix
cor_U = elu_cor$U
print(cor_U)
## 3 x 3 Matrix of class "dtrMatrix"
##      [,1]      [,2]      [,3]     
## [1,] 1.0000000 0.2638434 0.2608331
## [2,]         . 0.9303867 0.5447615
## [3,]         .         . 0.6129965

Precision Matrix

# LU decomposition of the precision matrix
lud_prec = lu(precision_matrix)
elu_prec = expand(lud_prec)

#Lower triangular matrix
prec_L = elu_prec$L
print(prec_L)
## 3 x 3 Matrix of class "dtrMatrix" (unitriangular)
##      [,1]       [,2]       [,3]      
## [1,]  1.0000000          .          .
## [2,] -0.1664765  1.0000000          .
## [3,] -0.1586864 -0.6135806  1.0000000
#Upper triangular matrix
prec_U = elu_prec$U
print(prec_U)
## 3 x 3 Matrix of class "dtrMatrix"
##      [,1]       [,2]       [,3]      
## [1,]  1.0932718 -0.1820040 -0.1734874
## [2,]          .  1.6038006 -0.9840609
## [3,]          .          .  1.0000000

Multiply the lower triangular matrix by the upper triangular matrix

# Multiply cor_L by cor_U to recover the original matrix, since A = LU
cor_L %*% cor_U
## 3 x 3 Matrix of class "dgeMatrix"
##           [,1]      [,2]      [,3]
## [1,] 1.0000000 0.2638434 0.2608331
## [2,] 0.2638434 1.0000000 0.6135806
## [3,] 0.2608331 0.6135806 1.0000000
# Likewise, multiply prec_L by prec_U to recover the precision matrix
prec_L %*% prec_U
## 3 x 3 Matrix of class "dgeMatrix"
##            [,1]       [,2]       [,3]
## [1,]  1.0932718 -0.1820040 -0.1734874
## [2,] -0.1820040  1.6341000 -0.9551793
## [3,] -0.1734874 -0.9551793  1.6313307

Multiplying L by U recovers the respective original matrix, as expected.


Calculus-Based Probability & Statistics.

Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary.

# We will use the TotalBsmtSF data. Check the minimum and see whether shifting is necessary.
z = dfsub$BsmntSqFt
min(z)
## [1] 0
hist(z, breaks = 50)


Then load the MASS package and run fitdistr to fit an exponential probability density function.
(See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).

# Fit an exponential probability density function
fit_exp = fitdistr(z, densfun = "exponential")
print(fit_exp)
##        rate    
##   9.456896e-04 
##  (2.474983e-05)

Find the optimal value of lambda for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, lambda)).

# Get 1000 samples from the exponential distribution
fit_exp$estimate
##         rate 
## 0.0009456896
samp = rexp(1000, fit_exp$estimate)
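
For an exponential distribution the maximum-likelihood estimate of the rate lambda is simply the reciprocal of the sample mean, so the fitted value can be sanity-checked directly:

# Closed-form MLE of the exponential rate; should match fit_exp$estimate
1 / mean(z)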

Plot a histogram and compare it with a histogram of your original variable.

# Plot a histogram of the observed and the simulated data
par(mfrow=c(1,2))
hist(z, breaks = 100, xlab = "observed_basementarea", main = "Observed")
hist(samp, breaks = 100, xlab = "predicted_basementArea", main = "Simulated")

Visually, the simulated exponential data are heavily right-skewed, with most of the mass near zero and a long upper tail, whereas the observed basement areas are concentrated around their center (roughly 1,000 square feet). The exponential distribution is therefore not a close fit to the observed data.
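
As a rough quantitative check of the fit (a sketch; the many zero values in the data create ties, so ks.test will warn and its p-value is approximate), a Kolmogorov-Smirnov test can compare the observed data with the fitted exponential CDF:

# Compare the observed data to the fitted exponential distribution
ks.test(z, "pexp", rate = fit_exp$estimate)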


Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).

# Find the 5th and 95th percentile of the simulated sample data
quantile(samp, probs = c(0.05, 0.95))
##         5%        95% 
##   61.24837 2977.49949
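
The percentiles implied directly by the fitted exponential CDF, using the quantile function q(p) = -log(1 - p)/lambda, can also be obtained with qexp; these should be close to (but not identical to) the sample percentiles above:

# Theoretical 5th and 95th percentiles of the fitted exponential
qexp(c(0.05, 0.95), rate = fit_exp$estimate)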

Also generate a 95% confidence interval from the empirical data, assuming normality.

# Calculating the 95% confidence interval for the mean of the observed data
mean_z = mean(z)
n_z = length(z)
sd_z = sd(z)

se = qnorm(0.975) * sd_z/sqrt(n_z)

left_int = mean_z - se
print(left_int)
## [1] 1048.831
right_int = mean_z + se
print(right_int)
## [1] 1066.028

This gives a 95% confidence interval for the mean basement square footage, centered on the sample mean of roughly 1,057 square feet, with the bounds printed above.
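
As a cross-check, the same interval can be obtained in one line with t.test (which uses the t distribution; with n = 1,460 this is essentially identical to the normal-based interval):

# 95% confidence interval for the mean of the observed data
t.test(z, conf.level = 0.95)$conf.int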


# Simulate a normal distribution with the same mean and standard deviation to illustrate the assumed normality
normalitytransformed = rnorm(length(z), mean_z, sd_z)
hist(normalitytransformed)

Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

# Empirical 5th and 95th percentile of the observed data
quantile(z, probs = c(0.05, 0.95))
##     5%    95% 
##  519.3 1753.0

The empirical percentiles (about 519 and 1,753 square feet) differ substantially from those implied by the fitted exponential (about 61 and 2,977): the exponential places far too much mass near zero and in the extreme right tail. This confirms the visual impression that an exponential distribution is a poor fit for basement square footage.

Modeling.

Build some type of multiple regression model and submit your model to the competition board.
Provide your complete model summary and results with analysis.

Training Data and Model Generation


Load the selected variables into a dataframe and perform data cleanup operations.

library(dplyr)
library(naniar)
library(tidyselect)
library(caret)
library(purrr)
library(tidymodels)

# Remove features with missing or limited data as evaluated from the summary statistics above.
plot_missing(dftrain,theme_config = theme_bw())

a<-miss_var_summary(dftrain) %>% filter(pct_miss>50) %>% select(variable) %>%  unlist() %>% as.vector()

With this we will remove the features that are sparse (more than 50% missing values).

plot_histogram(dftrain)

lm_spec<-linear_reg() %>%
  set_engine(engine = "lm")
set.seed(1)
split<-initial_split(dftrain,prop=.8)

df_train<-training(split)
df_test<-testing(split)



LMRecipe<- df_train %>% recipe(SalePrice~.) %>% 
  #remove all non numeric based variables
  step_rm(all_of(a)) %>% 
  step_other(all_nominal(),threshold = .05) %>% 
  step_dummy(all_nominal()) %>% 
  step_medianimpute(all_predictors()) %>%
  step_corr(all_predictors(),threshold = .9) %>% 
  #step_BoxCox(all_predictors()) %>% 
  step_nzv(all_predictors()) %>% 
  prep()

set.seed(1)
lm_res<-fit_resamples(lm_spec,
                      LMRecipe,
                      vfold_cv(df_train, v = 10),
                      control = control_resamples(save_pred = TRUE))

lm_res %>%
  collect_metrics()
## # A tibble: 2 x 6
##   .metric .estimator      mean     n   std_err .config             
##   <chr>   <chr>          <dbl> <int>     <dbl> <chr>               
## 1 rmse    standard   33073.        9 2874.     Preprocessor1_Model1
## 2 rsq     standard       0.834     9    0.0192 Preprocessor1_Model1
lm_res %>%
  unnest(.predictions) %>%
  ggplot(aes(SalePrice, .pred, color = id)) +
  facet_wrap(~id)+
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_point(alpha = 0.5) +
  labs(
    x = "Truth",
    y = "Predicted Sale Price",
    color = NULL
  )+
  stat_poly_eq(aes(label = paste(..adj.rr.label.., sep = "~~~")), 
                   label.x.npc = "left", label.y.npc = .9,
                   formula = y~x, parse = TRUE, size = 4)+
  theme(panel.background = element_blank(),
        panel.grid = element_blank())

trainset<-
  bake(LMRecipe, new_data = df_train)

testset<-
  bake(LMRecipe, new_data = df_test)

model1<-lm_spec %>% 
  fit(SalePrice~.,data = trainset)
(mod1_sum <- summary(model1$fit))
## 
## Call:
## stats::lm(formula = SalePrice ~ ., data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -392371  -11744     525   10984  215977 
## 
## Coefficients: (1 not defined because of singularities)
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -2.772e+05  1.328e+06  -0.209 0.834665    
## Id                     2.044e+00  2.021e+00   1.011 0.312039    
## MSSubClass            -1.840e+01  6.565e+01  -0.280 0.779327    
## LotFrontage            6.841e+01  5.253e+01   1.302 0.193088    
## LotArea                3.464e-01  9.300e-02   3.725 0.000206 ***
## OverallQual            1.110e+04  1.202e+03   9.235  < 2e-16 ***
## OverallCond            6.256e+03  1.062e+03   5.893 5.08e-09 ***
## YearBuilt              1.911e+02  8.539e+01   2.238 0.025439 *  
## YearRemodAdd           6.970e+01  6.866e+01   1.015 0.310255    
## MasVnrArea             2.788e+01  6.961e+00   4.005 6.63e-05 ***
## BsmtFinSF1             3.299e+01  5.127e+00   6.433 1.88e-10 ***
## BsmtFinSF2             3.071e+01  1.015e+01   3.027 0.002527 ** 
## BsmtUnfSF              2.128e+01  4.714e+00   4.514 7.08e-06 ***
## TotalBsmtSF                   NA         NA      NA       NA    
## X1stFlrSF              2.794e+01  1.869e+01   1.494 0.135380    
## X2ndFlrSF              5.586e+01  1.873e+01   2.983 0.002916 ** 
## GrLivArea              2.016e+01  1.805e+01   1.117 0.264281    
## BsmtFullBath           4.271e+03  2.521e+03   1.695 0.090448 .  
## BsmtHalfBath           6.627e+01  3.849e+03   0.017 0.986266    
## FullBath               2.891e+03  2.765e+03   1.045 0.296114    
## HalfBath               2.153e+03  2.630e+03   0.819 0.413094    
## BedroomAbvGr          -4.388e+03  1.693e+03  -2.592 0.009675 ** 
## TotRmsAbvGrd           1.393e+03  1.147e+03   1.214 0.224967    
## Fireplaces             3.955e+03  1.824e+03   2.169 0.030295 *  
## GarageYrBlt            9.104e+01  6.835e+01   1.332 0.183125    
## GarageCars             3.051e+03  2.829e+03   1.078 0.281101    
## GarageArea             1.122e+01  9.980e+00   1.125 0.261041    
## WoodDeckSF             1.854e+01  7.693e+00   2.410 0.016120 *  
## OpenPorchSF           -1.627e+01  1.470e+01  -1.106 0.268811    
## MoSold                -3.592e+02  3.145e+02  -1.142 0.253668    
## YrSold                -1.998e+02  6.567e+02  -0.304 0.760973    
## MSZoning_RM           -4.996e+03  3.492e+03  -1.431 0.152853    
## MSZoning_other        -4.588e+03  5.468e+03  -0.839 0.401677    
## LotShape_Reg           6.286e+02  2.045e+03   0.307 0.758571    
## LandContour_other     -8.528e+03  3.493e+03  -2.441 0.014794 *  
## LotConfig_CulDSac      1.427e+04  4.188e+03   3.406 0.000684 ***
## LotConfig_Inside      -4.953e+02  2.131e+03  -0.232 0.816268    
## LandSlope_other        3.698e+03  4.791e+03   0.772 0.440414    
## Neighborhood_Edwards  -5.677e+03  4.870e+03  -1.166 0.243980    
## Neighborhood_Gilbert  -5.493e+02  5.116e+03  -0.107 0.914516    
## Neighborhood_NAmes     1.203e+03  4.439e+03   0.271 0.786500    
## Neighborhood_NridgHt   2.228e+04  5.298e+03   4.205 2.82e-05 ***
## Neighborhood_NWAmes   -1.148e+03  5.363e+03  -0.214 0.830515    
## Neighborhood_OldTown   3.974e+02  5.588e+03   0.071 0.943318    
## Neighborhood_Somerst   1.436e+04  6.234e+03   2.304 0.021441 *  
## Neighborhood_other     9.566e+03  3.632e+03   2.634 0.008555 ** 
## Condition1_Norm        1.027e+04  3.945e+03   2.602 0.009382 ** 
## Condition1_other      -6.319e+03  4.879e+03  -1.295 0.195574    
## BldgType_TwnhsE       -1.317e+04  7.548e+03  -1.745 0.081202 .  
## BldgType_other        -1.811e+04  6.947e+03  -2.607 0.009263 ** 
## HouseStyle_X1Story     1.430e+04  4.230e+03   3.381 0.000749 ***
## HouseStyle_X2Story    -5.298e+03  3.818e+03  -1.387 0.165616    
## HouseStyle_SLvl        6.489e+03  5.415e+03   1.198 0.231029    
## RoofStyle_Hip          9.942e+02  2.412e+03   0.412 0.680269    
## Exterior1st_Plywood    8.083e+02  5.723e+03   0.141 0.887722    
## Exterior1st_Wd.Sdng   -2.866e+03  6.394e+03  -0.448 0.654113    
## Exterior1st_other      1.403e+04  5.455e+03   2.573 0.010225 *  
## Exterior2nd_MetalSd    6.473e+03  3.456e+03   1.873 0.061323 .  
## Exterior2nd_Plywood   -6.115e+03  5.423e+03  -1.128 0.259734    
## Exterior2nd_VinylSd    2.760e+03  3.209e+03   0.860 0.389921    
## Exterior2nd_Wd.Sdng    7.488e+03  6.622e+03   1.131 0.258387    
## Exterior2nd_other     -5.033e+03  5.564e+03  -0.905 0.365928    
## MasVnrType_None        8.163e+03  2.784e+03   2.933 0.003432 ** 
## MasVnrType_Stone       8.392e+03  3.727e+03   2.252 0.024542 *  
## ExterQual_TA          -6.090e+03  2.984e+03  -2.041 0.041518 *  
## ExterCond_TA          -5.744e+02  2.817e+03  -0.204 0.838451    
## Foundation_CBlock      3.583e+03  3.642e+03   0.984 0.325452    
## Foundation_PConc       5.739e+03  4.096e+03   1.401 0.161439    
## BsmtQual_Gd           -1.813e+04  3.357e+03  -5.401 8.16e-08 ***
## BsmtQual_TA           -1.688e+04  3.674e+03  -4.595 4.84e-06 ***
## BsmtCond_other        -3.687e+03  3.374e+03  -1.093 0.274716    
## BsmtExposure_Gd        1.548e+04  3.878e+03   3.992 7.01e-05 ***
## BsmtExposure_Mn       -5.628e+03  3.965e+03  -1.419 0.156060    
## BsmtExposure_No       -8.096e+03  2.795e+03  -2.897 0.003845 ** 
## BsmtFinType1_BLQ      -1.237e+03  3.525e+03  -0.351 0.725650    
## BsmtFinType1_GLQ       3.400e+03  3.160e+03   1.076 0.282214    
## BsmtFinType1_LwQ      -6.436e+03  4.645e+03  -1.386 0.166159    
## BsmtFinType1_Rec      -3.754e+03  3.701e+03  -1.014 0.310606    
## BsmtFinType1_Unf      -2.399e+03  3.522e+03  -0.681 0.495862    
## BsmtFinType2_other    -5.539e+03  4.562e+03  -1.214 0.224938    
## HeatingQC_Gd          -8.272e+02  2.630e+03  -0.315 0.753182    
## HeatingQC_TA          -5.277e+02  2.494e+03  -0.212 0.832453    
## CentralAir_Y          -3.891e+01  4.373e+03  -0.009 0.992902    
## Electrical_SBrkr       1.952e+03  3.626e+03   0.538 0.590364    
## KitchenQual_Gd        -2.182e+04  3.515e+03  -6.208 7.67e-10 ***
## KitchenQual_TA        -1.852e+04  3.722e+03  -4.976 7.57e-07 ***
## Functional_other      -1.464e+04  3.651e+03  -4.009 6.52e-05 ***
## FireplaceQu_TA         2.483e+02  2.637e+03   0.094 0.925004    
## GarageType_BuiltIn    -4.800e+03  4.097e+03  -1.172 0.241600    
## GarageType_Detchd      2.173e+03  2.769e+03   0.785 0.432827    
## GarageFinish_RFn      -4.212e+03  2.458e+03  -1.714 0.086847 .  
## GarageFinish_Unf      -1.434e+03  2.936e+03  -0.488 0.625318    
## PavedDrive_Y           4.308e+03  3.668e+03   1.174 0.240464    
## SaleType_WD           -7.130e+04  2.069e+04  -3.446 0.000590 ***
## SaleType_other        -6.687e+04  2.065e+04  -3.237 0.001243 ** 
## SaleCondition_Normal   6.007e+03  3.045e+03   1.973 0.048782 *  
## SaleCondition_Partial -4.967e+04  2.071e+04  -2.398 0.016651 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27850 on 1073 degrees of freedom
## Multiple R-squared:  0.8907, Adjusted R-squared:  0.881 
## F-statistic: 92.04 on 95 and 1073 DF,  p-value: < 2.2e-16
linearM_results<- model1 %>%
  predict(new_data = trainset) %>% 
  mutate(
    truth = trainset$SalePrice,
    model = "LinearModel",
    set = "Training Set"
  ) %>% 
  bind_rows(model1 %>%
  predict(new_data = testset) %>%
  mutate(
    truth = testset$SalePrice,
    model = "LinearModel",
    set = "Testing Set"))

variableImportancePlot <- function(model=NULL, chart_title='Variable Importance Plot') {
  # Make sure a model was passed
  if (is.null(model)) {
    return(NULL)
  }
  
  # Use caret and ggplot to print a variable importance plot
  varImp(model) %>% as.data.frame() %>% top_n(n = 10) %>% 
    ggplot(aes(x = reorder(rownames(.), desc(Overall)), y = Overall)) +
    geom_col(aes(fill = Overall)) +
    theme(panel.background = element_blank(),
          panel.grid = element_blank(),
          axis.text.x = element_text(angle = 90)) +
    scale_fill_gradient() +
    labs(title = chart_title,
         x = "Parameter",
         y = "Relative Importance")
}
variableImportancePlot(model1$fit, "Linear Model 1 Variable Importance")

linearM_results %>% 
  ggplot(aes(x = truth,y = .pred))+
  geom_point(pch =21, alpha = 0.3, fill = "skyblue") +
  geom_smooth(method = "lm", color = "red3") +
  facet_wrap(set~model, scales = 'free')+
stat_poly_eq(aes(label = paste(..adj.rr.label.., sep = "~~~")), 
                   label.x.npc = "left", label.y.npc = .9,
                   formula = y~x, parse = TRUE, size = 4)+
  theme(panel.background = element_blank(),
        panel.grid = element_blank())

The fitted model has a multiple R-squared of about 0.89 on the training data, meaning roughly 89% of the variance in SalePrice is explained by the predictors in the model; the 10-fold cross-validation estimate of R-squared is about 0.83, so performance holds up reasonably well out of sample. Many of the predictors are statistically significant. With this we can generate predictions for the Kaggle test set.
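
To quantify how the model performs on each split, a short sketch using yardstick (loaded with tidymodels); linearM_results and its set column are defined above:

# RMSE of the linear model on the training and testing splits
linearM_results %>%
  group_by(set) %>%
  rmse(truth = truth, estimate = .pred)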


Model Testing and submission

Let us test how good our model performs on the test data set.

# Load Testing set
dftest = read.csv("test.csv")
dftest$SalePrice<-NA
dftest$BsmtFinSF2<-NA

LMRecipe<- dftrain %>% recipe(SalePrice~.) %>% 
  #remove all non numeric based variables
  step_rm(all_of(a)) %>%
  step_rm('BsmtFinSF2') %>% 
  step_other(all_nominal(),threshold = .05) %>% 
  step_dummy(all_nominal()) %>% 
  step_medianimpute(all_predictors()) %>%
  step_corr(all_predictors(),threshold = .9) %>% 
  #step_BoxCox(all_predictors()) %>% 
  step_nzv(all_predictors()) %>% 
  prep()

# Predict the saleprice for the testing data set
# 
trainset<-
  bake(LMRecipe, new_data = dftrain)
testset<-
  bake(LMRecipe, new_data = dftest)

model1<-lm_spec %>% 
  fit(SalePrice~.,data = trainset)

linearM_results<- model1 %>%
  predict(new_data = testset) %>% 
  mutate(Id = testset$Id,
                           SalePrice = .pred
) %>% select(-.pred)
#write.csv(linearM_results, file = "kagglesubmission.csv", row.names = F)

#View the first 5 rows of the data frame
print(head(linearM_results, 5))
## # A tibble: 5 x 2
##      Id SalePrice
##   <int>     <dbl>
## 1  1461    81008.
## 2  1462   150425.
## 3  1463   171205.
## 4  1464   182365.
## 5  1465   167906.

Kaggle Submission - Score

Report your Kaggle.com user name and score.

Kaggle Username: Joshuargst

Kaggle Score: 0.32199, ranked 3,972 on the leaderboard