library(tidyverse)
library(moments)
library(ggcorrplot)
library(reshape2)
library(naniar)
library(corrplot)
library(DescTools)

Import the data

Note I just downloaded the data from Kaggle and uploaded it to my GitHub, then pulled the data in that way

# Read the Kaggle train/test CSVs from the GitHub copy, then print a per-column
# summary of the training data.
train <- read.csv(
  "https://raw.githubusercontent.com/jonburns2454/DATA-605/main/final_data/train.csv"
)
test <- read.csv(
  "https://raw.githubusercontent.com/jonburns2454/DATA-605/main/final_data/test.csv"
)
summary(train)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical          X1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##    X2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      X3SsnPorch      ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 

Choose variables and check skew:

Pick one of the quantitative independent variables from the training data set (train.csv) , and define that variable as X. Make sure this variable is skewed to the right! Pick the dependent variable and define it as Y.

Sifting through the data the OpenPorchSF variable seems to fit the task:

# Histogram of open-porch square footage. The column is referenced by its bare
# name inside aes() (looked up in `train`) instead of `train$OpenPorchSF`, and
# `bins` is set explicitly to the value stat_bin() was defaulting to.
ggplot(train, aes(x = OpenPorchSF)) +
  geom_histogram(bins = 30)

A skewness above one should be more than enough to demonstrate that a variable is right-skewed.

# Sample skewness of OpenPorchSF; a positive value indicates right skew.
skewness(train[["OpenPorchSF"]])
## [1] 2.361912

Set variables

# X is the right-skewed predictor (open-porch area); Y is the response (sale price).
Y <- train[["SalePrice"]]
X <- train[["OpenPorchSF"]]

Probability.

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 3d quartile of the X variable, and the small letter “y” is estimated as the 2d quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.

# Cut points: y is the 2nd quartile (median) of Y, x is the 3rd quartile of X.
y <- quantile(Y, probs = 0.5)
x <- quantile(X, probs = 0.75)
  a. P(X>x | Y>y) b. P(X>x, Y>y) c. P(X<x | Y>y)
# a: P(X > x | Y > y) -- among homes priced above y, the share with X above x.
a <- sum(X[Y > y] > x) / sum(Y > y)

# b: P(X > x, Y > y) -- share of all homes that exceed both cut points.
b <- sum(Y > y & X > x) / length(Y)

# c: P(X < x | Y > y) -- strict inequality, so homes with X exactly equal to x
# are counted in neither a nor c.
c <- sum(X[Y > y] < x) / sum(Y > y)

Make a table of counts:

# Cross-tabulate the two threshold indicators: rows are X > x, columns are Y > y.
table_of_counts <- table(X > x, Y > y)
print(table_of_counts)
##        
##         FALSE TRUE
##   FALSE   637  461
##   TRUE     95  267
# Label the cells by quartile band (rows: X, columns: Y), append row and column
# totals, and display the finished table.
dimnames(table_of_counts)[[1L]] <- c("<=3d quartile", ">3d quartile")
dimnames(table_of_counts)[[2L]] <- c("<=2d quartile", ">2d quartile")
table_of_counts <- addmargins(table_of_counts)
print(table_of_counts)
##                
##                 <=2d quartile >2d quartile  Sum
##   <=3d quartile           637          461 1098
##   >3d quartile             95          267  362
##   Sum                     732          728 1460

Interpreting probabilities a-c

Interpret the meanings of the probabilities: a. P(X>x | Y>y)
The probability of X being greater than x (the third quartile) given that Y is greater than y (the second quartile). In the context of the data, it is the probability that OpenPorchSF is greater than its third quartile when the SalePrice is greater than its second quartile (the median). The probability is about 0.3668.

  b. P(X>x, Y>y)
    The probability of both OpenPorchSF being greater than its 3rd quartile and the SalePrice being greater than its second quartile is 0.182876.

  c. P(X<x | Y>y) Lastly, the probability of OpenPorchSF being less than its third quartile given that SalePrice is above its second quartile is 0.625.

Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 3d quartile for X, and let B be the new variable counting those observations above the 2d quartile for Y. Does P(A|B)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.

Define A and B and calculate the probabilities for P(A|B)=P(A)P(B)

# Indicator variables: A is 1 when X exceeds x, B is 1 when Y exceeds y.
A <- as.numeric(X > x)
B <- as.numeric(Y > y)

# P(A | B): share of the B cases that are also A cases.
P_A_B <- sum(A * B) / sum(B)

# Marginal probabilities P(A) and P(B).
P_A <- sum(A) / length(A)
P_B <- sum(B) / length(B)

# Product of the two marginals, P(A) * P(B).
P_A_P_B <- P_A * P_B

print(paste("The probability of P(A|B) is:", P_A_B, ", while the probability of P(A)P(B) is:", P_A_P_B))
## [1] "The probability of P(A|B) is: 0.366758241758242 , while the probability of P(A)P(B) is: 0.123632951773316"

Chi-Square test:

Lastly, run a Chi-square test to validate this finding.

# Pearson chi-squared test of association between the indicators A and B.
chi_a_b <- chisq.test(x = A, y = B)
print(chi_a_b)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  A and B
## X-squared = 108.66, df = 1, p-value < 2.2e-16

Chi-square results:

The null hypothesis in this instance contends that the OpenPorchSF and SalePrice indicator variables are independent of one another. The Chi-Square test produced a very low p-value (< 2.2e-16) and a high X-squared statistic, indicating that we should reject the null, which suggests a dependent relationship between OpenPorchSF and SalePrice.

Descriptive and Inferential Statistics

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y. Provide a 95% CI for the difference in the mean of the variables. Derive a correlation matrix for two of the quantitative variables you selected. Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval. Discuss the meaning of your analysis.

Baseline Stats

# Five-number summary plus mean of the predictor X.
summary(X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   25.00   46.66   68.00  547.00
# Five-number summary plus mean of the response Y.
summary(Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000

Missingness across the dataset

# Count missing values per column, reshape to one row per variable, and keep
# only the variables that have at least one missing value.
# across() replaces the superseded summarise_all().
missing_train <- train %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "missing_count"
  ) %>%
  filter(missing_count > 0)

# Note: after the filter above, missing_train lists only the variables that
# contain missing values.
# Horizontal bar chart of the missing-value count for each such variable.
ggplot(missing_train, aes(x = reorder(variable, -missing_count), y = missing_count)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +
  labs(
    title = "Count of Missing Values by Variable",
    x = "Variable",
    y = "Count of Missing Values"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold")
  )

The missingness will eventually be a problem, so this training df will require some form of imputation down the road.

Numeric Distribution of variables:

# Violin plot of every numeric column, one free-scaled facet per variable.
# pivot_longer() replaces the superseded gather(); the long table has one row
# per (variable, value) pair, which is what the facets consume.
train %>%
  keep(is.numeric) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(x = variable, y = value)) +
  geom_violin() +
  facet_wrap(~variable, scales = "free") +
  theme(
    strip.text = element_text(size = 6),
    axis.text.y = element_text(size = 5)
  )
## Warning: Removed 348 rows containing non-finite values (`stat_ydensity()`).

Scatter Plot

# Scatter of sale price against open-porch area, overlaid with a fitted linear
# model line drawn without its confidence band.
ggplot(train, aes(x = OpenPorchSF, y = SalePrice)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "OpenPorchSF and SalesPrice Viz",
    x = "OpenPorchSF(X)",
    y = "SalesPrice(Y)"
  )
## `geom_smooth()` using formula = 'y ~ x'

Confidence Interval - 95%

Provide a 95% CI for the difference in the mean of the variables.

# I'll use a t-test for this.

# Two-sample t-test of OpenPorchSF against SalePrice; the printed result
# includes a 95% confidence interval for the difference in the two means.
t.test(x = train$OpenPorchSF,y = train$SalePrice, conf.level = 0.95)
## 
##  Welch Two Sample t-test
## 
## data:  train$OpenPorchSF and train$SalePrice
## t = -86.996, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -184952.9 -176796.2
## sample estimates:
##    mean of x    mean of y 
##     46.66027 180921.19589

Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval. Discuss the meaning of your analysis.

# Pearson correlation test of H0: correlation = 0, reporting a 99% confidence
# interval for the correlation.
cor_test_1 <- cor.test(train$OpenPorchSF, train$SalePrice, method = "pearson", conf.level = 0.99)
print(cor_test_1)
## 
##  Pearson's product-moment correlation
## 
## data:  train$OpenPorchSF and train$SalePrice
## t = 12.711, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
##  0.2538797 0.3752497
## sample estimates:
##       cor 
## 0.3158562
# Kendall rank-correlation test on the same pair of variables. Note that the
# printed result for this method contains no confidence interval, even though
# conf.level is supplied.
cor_test_2 <- cor.test(train$OpenPorchSF, train$SalePrice, method = "kendall", conf.level = 0.99)
print(cor_test_2)
## 
##  Kendall's rank correlation tau
## 
## data:  train$OpenPorchSF and train$SalePrice
## z = 18.724, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##      tau 
## 0.350161

I wanted to compare two different correlation methods, Pearson and Kendall. Both methods yielded statistically significant result with a low pvalue and a high z value. However, Kendall is used when evaluating data that is not normally distributed so the Kendall cor.test is more promising than Pearson.

Test the hypothesis

Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval. Discuss the meaning of your analysis.

In terms of the question, the Kendall Correlation Test reveals that the null hypothesis is rejected, backed by the very low pvalue, the relationship between Open Porch Square footage and Sales Price is statistically significant.

Confidence Interval:

# Two-sample t-test of OpenPorchSF against SalePrice at the 99% level.
# NOTE(review): this interval is for the difference in means, not for the
# correlation between the two variables.
t.test(train$OpenPorchSF,train$SalePrice, conf.level = 0.99)
## 
##  Welch Two Sample t-test
## 
## data:  train$OpenPorchSF and train$SalePrice
## t = -86.996, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
##  -186237.0 -175512.1
## sample estimates:
##    mean of x    mean of y 
##     46.66027 180921.19589

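This confidence interval indicates 99% confidence that the difference in means between OpenPorchSF and SalePrice lies between -186237.0 and -175512.1. (The 99% confidence interval for the correlation itself, reported by the Pearson test above, is 0.2539 to 0.3752; because it excludes 0, the hypothesis of zero correlation is rejected.)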

Linear Algebra and Correlation.

###Invert your correlation matrix. (This is known as the precision matrix and contains variance inflation factors on the diagonal.)

# 2 x 2 correlation matrix of the chosen pair of variables.
cor_X_Y <- cor(train[c("OpenPorchSF", "SalePrice")])
print(cor_X_Y)
##             OpenPorchSF SalePrice
## OpenPorchSF   1.0000000 0.3158562
## SalePrice     0.3158562 1.0000000
# Invert the correlation matrix; solve() called with one matrix returns its inverse.
precision_mat <- solve(cor_X_Y)
print(precision_mat)
##             OpenPorchSF  SalePrice
## OpenPorchSF   1.1108213 -0.3508598
## SalePrice    -0.3508598  1.1108213

Multiply the correlation matrix by the precision matrix.

# Matrix product (%*%) of the correlation matrix and its inverse. The `*`
# operator multiplies element by element, which is not the product asked for.
cor_X_Y %*% precision_mat
## Expected: the 2 x 2 identity matrix, up to floating-point rounding
## (output not regenerated here).

Multiply the precision matrix by the correlation matrix

# Matrix product (%*%) of the inverse and the correlation matrix. The `*`
# operator multiplies element by element, which is not the product asked for.
precision_mat %*% cor_X_Y
## Expected: the 2 x 2 identity matrix, up to floating-point rounding
## (output not regenerated here).

Principal Component Analysis

Conduct principle components analysis (research this!) and interpret. Discuss.

# PCA on the two variables after scaling each to unit variance. The argument is
# spelled `scale.` in full (prcomp's formal name) rather than relying on partial
# matching of `scale`.
pca_results <- prcomp(train[, c("OpenPorchSF", "SalePrice")], scale. = TRUE)
summary(pca_results)
## Importance of components:
##                           PC1    PC2
## Standard deviation     1.1471 0.8271
## Proportion of Variance 0.6579 0.3421
## Cumulative Proportion  0.6579 1.0000

Vizualizing the PCA Analysis:

library("factoextra")
# Plot the observations on the principal-component axes.
fviz_pca_ind(pca_results, col.ind = "coral3")

# Scree plot with bar labels enabled. TRUE is written out because `T` is an
# ordinary variable that can be reassigned.
fviz_eig(pca_results, addlabels = TRUE)

Discussion - PCA Results:

The summary of the principal component analysis captures the amount of variance that is explained by each principal component. Captured in both the summary(PCA_results) and the scree plot, dimension 1 covers 65.8% of the variance in the bivariate dataset and dimension 2 captures 34.2% of this variance. These components capture all of the variance within the dataset, however it is important to note that there are only two dimensions in this df because there are only two variables present in the PCA reduction (OpenPorchSF, SalePrice).

Calculus-Based Probability & Statistics.

Many times, it makes sense to fit a closed form distribution to data. For your variable that is skewed to the right, shift it so that the minimum value is above zero. Then load the MASS package and run fitdistr to fit an exponential probability density function.(See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

Check variables for anything below zero

Negative values are not possible for these two variables, since a house cannot have a negative sale price nor a negative open-porch square footage. Nonetheless, OpenPorchSF needs to be shifted so that there are no 0 values.

# Confirm the minimum of OpenPorchSF before shifting it.
summary(train[["OpenPorchSF"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   25.00   46.66   68.00  547.00
# Confirm the minimum of SalePrice.
summary(train[["SalePrice"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
# Shift OpenPorchSF so that its minimum becomes 1, i.e. strictly above zero.
shift_X <- 1 + (train[["OpenPorchSF"]] - min(train[["OpenPorchSF"]]))

load the MASS package and run fitdistr to fit an exponential probability density function.(See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).

library(MASS)
library(gridExtra)

Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))).

# Fit an exponential density to the shifted variable, keep the estimated rate,
# and draw 1000 values from an exponential distribution with that rate.
fit_X <- fitdistr(shift_X, densfun = "exponential")
lambda <- fit_X[["estimate"]]
sample <- rexp(n = 1000, rate = lambda)

Comparing Original VS Sample DF

Plot a histogram and compare it with a histogram of your original variable.

# Side-by-side histograms: the observed variable (left) and the exponential
# sample (right). par() returns the previous settings, which are restored
# afterwards so the two-panel layout does not leak into later plots.
old_par <- par(mfrow = c(1, 2))
hist(
  train$OpenPorchSF,
  main = " Original Open PorchSF",
  xlab = "Square Footage",
  breaks = 30
)
hist(
  sample,
  main = "Exponsntial Distribution Sample",
  xlab = "Sample Square Footage",
  col = "blue",
  breaks = 30
)
par(old_par)

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).

# 5th and 95th percentiles of the fitted exponential, via its quantile function.
perc_5 <- qexp(p = 0.05, rate = lambda)
perc_95 <- qexp(p = 0.95, rate = lambda)
print(paste(perc_5, perc_95))
## [1] "2.44465246346802 142.777420906151"

95% CI for the empirical data

Also generate a 95% confidence interval from the empirical data, assuming normality.

# 95% t-based confidence interval for the mean of OpenPorchSF, built by hand:
# sample mean, standard error, and the t critical value on n - 1 degrees of freedom.
mu <- mean(train[["OpenPorchSF"]])
Std_Error <- sd(train[["OpenPorchSF"]]) / sqrt(nrow(train))
Marg_Error <- Std_Error * qt(0.975, df = nrow(train) - 1)

# Lower and upper limits: mean minus / plus the margin of error.
confidence_int <- mu + c(-1, 1) * Marg_Error
print(confidence_int)
## [1] 43.25888 50.06167

The results from the CI for the empirical data tells me that I can be 95% confident that the mean of the original Open Porch square footage variable is within 43.25 SF and 50.06 SF.

This can also be verified with a t.test. The CIs match, meaning my calculations from above can be trusted.

# One-sample t-test on OpenPorchSF; its printed 95% confidence interval for the
# mean is used to cross-check the hand-computed interval.
CI <- t.test(train$OpenPorchSF)

print(CI)
## 
##  One Sample t-test
## 
## data:  train$OpenPorchSF
## t = 26.909, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  43.25888 50.06167
## sample estimates:
## mean of x 
##  46.66027

Empirical 5th percentile and 95th percentile

Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

# Empirical 5th and 95th percentiles of OpenPorchSF. These assignments replace
# whatever perc_5 / perc_95 held before.
perc_5 <- quantile(train[["OpenPorchSF"]], probs = 0.05)
perc_95 <- quantile(train[["OpenPorchSF"]], probs = 0.95)
print(paste("The 5th percentile:", perc_5, ", The 95th percentile:", perc_95))
## [1] "The 5th percentile: 0 , The 95th percentile: 175.05"

This can be interpreted as 90% of the data is distributed between the intervals of 0 and 175.05 square feet. Looking back at the summary(train$OpenPorchSF) the data is so clumped around zero that the min and 1st. Quantile were both zero, so a 5th percentile makes sense given the context.

# Re-display the summary to show how much of the data sits at zero.
summary(train[["OpenPorchSF"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   25.00   46.66   68.00  547.00

Modeling.

Build some type of regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

Just from a brief look at the test and train datasets earlier, I know that there will be a problem with missing values. Because of this, I will try an imputation technique.

Check NA’s

# Count of missing values in every column of the training data, shown one
# column per line. across() replaces the superseded summarize_all().
train %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  glimpse()
## Rows: 1
## Columns: 81
## $ Id            <int> 0
## $ MSSubClass    <int> 0
## $ MSZoning      <int> 0
## $ LotFrontage   <int> 259
## $ LotArea       <int> 0
## $ Street        <int> 0
## $ Alley         <int> 1369
## $ LotShape      <int> 0
## $ LandContour   <int> 0
## $ Utilities     <int> 0
## $ LotConfig     <int> 0
## $ LandSlope     <int> 0
## $ Neighborhood  <int> 0
## $ Condition1    <int> 0
## $ Condition2    <int> 0
## $ BldgType      <int> 0
## $ HouseStyle    <int> 0
## $ OverallQual   <int> 0
## $ OverallCond   <int> 0
## $ YearBuilt     <int> 0
## $ YearRemodAdd  <int> 0
## $ RoofStyle     <int> 0
## $ RoofMatl      <int> 0
## $ Exterior1st   <int> 0
## $ Exterior2nd   <int> 0
## $ MasVnrType    <int> 8
## $ MasVnrArea    <int> 8
## $ ExterQual     <int> 0
## $ ExterCond     <int> 0
## $ Foundation    <int> 0
## $ BsmtQual      <int> 37
## $ BsmtCond      <int> 37
## $ BsmtExposure  <int> 38
## $ BsmtFinType1  <int> 37
## $ BsmtFinSF1    <int> 0
## $ BsmtFinType2  <int> 38
## $ BsmtFinSF2    <int> 0
## $ BsmtUnfSF     <int> 0
## $ TotalBsmtSF   <int> 0
## $ Heating       <int> 0
## $ HeatingQC     <int> 0
## $ CentralAir    <int> 0
## $ Electrical    <int> 1
## $ X1stFlrSF     <int> 0
## $ X2ndFlrSF     <int> 0
## $ LowQualFinSF  <int> 0
## $ GrLivArea     <int> 0
## $ BsmtFullBath  <int> 0
## $ BsmtHalfBath  <int> 0
## $ FullBath      <int> 0
## $ HalfBath      <int> 0
## $ BedroomAbvGr  <int> 0
## $ KitchenAbvGr  <int> 0
## $ KitchenQual   <int> 0
## $ TotRmsAbvGrd  <int> 0
## $ Functional    <int> 0
## $ Fireplaces    <int> 0
## $ FireplaceQu   <int> 690
## $ GarageType    <int> 81
## $ GarageYrBlt   <int> 81
## $ GarageFinish  <int> 81
## $ GarageCars    <int> 0
## $ GarageArea    <int> 0
## $ GarageQual    <int> 81
## $ GarageCond    <int> 81
## $ PavedDrive    <int> 0
## $ WoodDeckSF    <int> 0
## $ OpenPorchSF   <int> 0
## $ EnclosedPorch <int> 0
## $ X3SsnPorch    <int> 0
## $ ScreenPorch   <int> 0
## $ PoolArea      <int> 0
## $ PoolQC        <int> 1453
## $ Fence         <int> 1179
## $ MiscFeature   <int> 1406
## $ MiscVal       <int> 0
## $ MoSold        <int> 0
## $ YrSold        <int> 0
## $ SaleType      <int> 0
## $ SaleCondition <int> 0
## $ SalePrice     <int> 0
# Count of missing values in every column of the test data, shown one column
# per line. across() replaces the superseded summarize_all().
test %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  glimpse()
## Rows: 1
## Columns: 80
## $ Id            <int> 0
## $ MSSubClass    <int> 0
## $ MSZoning      <int> 4
## $ LotFrontage   <int> 227
## $ LotArea       <int> 0
## $ Street        <int> 0
## $ Alley         <int> 1352
## $ LotShape      <int> 0
## $ LandContour   <int> 0
## $ Utilities     <int> 2
## $ LotConfig     <int> 0
## $ LandSlope     <int> 0
## $ Neighborhood  <int> 0
## $ Condition1    <int> 0
## $ Condition2    <int> 0
## $ BldgType      <int> 0
## $ HouseStyle    <int> 0
## $ OverallQual   <int> 0
## $ OverallCond   <int> 0
## $ YearBuilt     <int> 0
## $ YearRemodAdd  <int> 0
## $ RoofStyle     <int> 0
## $ RoofMatl      <int> 0
## $ Exterior1st   <int> 1
## $ Exterior2nd   <int> 1
## $ MasVnrType    <int> 16
## $ MasVnrArea    <int> 15
## $ ExterQual     <int> 0
## $ ExterCond     <int> 0
## $ Foundation    <int> 0
## $ BsmtQual      <int> 44
## $ BsmtCond      <int> 45
## $ BsmtExposure  <int> 44
## $ BsmtFinType1  <int> 42
## $ BsmtFinSF1    <int> 1
## $ BsmtFinType2  <int> 42
## $ BsmtFinSF2    <int> 1
## $ BsmtUnfSF     <int> 1
## $ TotalBsmtSF   <int> 1
## $ Heating       <int> 0
## $ HeatingQC     <int> 0
## $ CentralAir    <int> 0
## $ Electrical    <int> 0
## $ X1stFlrSF     <int> 0
## $ X2ndFlrSF     <int> 0
## $ LowQualFinSF  <int> 0
## $ GrLivArea     <int> 0
## $ BsmtFullBath  <int> 2
## $ BsmtHalfBath  <int> 2
## $ FullBath      <int> 0
## $ HalfBath      <int> 0
## $ BedroomAbvGr  <int> 0
## $ KitchenAbvGr  <int> 0
## $ KitchenQual   <int> 1
## $ TotRmsAbvGrd  <int> 0
## $ Functional    <int> 2
## $ Fireplaces    <int> 0
## $ FireplaceQu   <int> 730
## $ GarageType    <int> 76
## $ GarageYrBlt   <int> 78
## $ GarageFinish  <int> 78
## $ GarageCars    <int> 1
## $ GarageArea    <int> 1
## $ GarageQual    <int> 78
## $ GarageCond    <int> 78
## $ PavedDrive    <int> 0
## $ WoodDeckSF    <int> 0
## $ OpenPorchSF   <int> 0
## $ EnclosedPorch <int> 0
## $ X3SsnPorch    <int> 0
## $ ScreenPorch   <int> 0
## $ PoolArea      <int> 0
## $ PoolQC        <int> 1456
## $ Fence         <int> 1169
## $ MiscFeature   <int> 1408
## $ MiscVal       <int> 0
## $ MoSold        <int> 0
## $ YrSold        <int> 0
## $ SaleType      <int> 1
## $ SaleCondition <int> 0

Select only the numeric columns:

# Keep only the numeric columns of each data set; character columns are dropped.
train_is_numeric <- vapply(train, is.numeric, logical(1))
num_train <- train[train_is_numeric]

test_is_numeric <- vapply(test, is.numeric, logical(1))
num_test <- test[test_is_numeric]

Impute:

# mice provides the imputation routines used below.
library(mice)
# Fix the RNG seed so the imputed values are reproducible between runs.
set.seed(1999)

# Single imputation (m = 1) of the numeric training columns using mice's
# 'pmm' method, with 5 iterations.
# NOTE(review): num_train still contains Id and SalePrice, so they are part of
# the data handed to mice — confirm that using them during imputation is intended.
impute_train <- mice(num_train, m=1, method = 'pmm', maxit = 5)
## 
##  iter imp variable
##   1   1  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   2   1  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   3   1  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   4   1  LotFrontage*  MasVnrArea*  GarageYrBlt*
##   5   1  LotFrontage*  MasVnrArea*  GarageYrBlt*
# Pull the (only) completed training set out of the imputation object.
comp_im_train <- mice::complete(impute_train, action = 1L)

# Impute the numeric test columns with the same settings as the training set.
impute_test <- mice(data = num_test, m = 1, method = "pmm", maxit = 5)
## 
##  iter imp variable
##   1   1  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   2   1  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   3   1  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   4   1  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
##   5   1  LotFrontage*  MasVnrArea*  BsmtFinSF1*  BsmtFinSF2*  BsmtUnfSF*  TotalBsmtSF*  BsmtFullBath*  BsmtHalfBath*  GarageYrBlt*  GarageCars*  GarageArea*
# Pull the (only) completed test set out of the imputation object.
comp_im_test <- mice::complete(impute_test, action = 1L)

Reset names:

# Shorter working names for the imputed data sets used by the models below.
train_df <- comp_im_train
test_df <- comp_im_test

Check missing again

# Re-count missing values per column in the imputed training data.
train_df %>%
    summarise(across(everything(), ~ sum(is.na(.x)))) %>%
    glimpse()
## Rows: 1
## Columns: 38
## $ Id            <int> 0
## $ MSSubClass    <int> 0
## $ LotFrontage   <int> 0
## $ LotArea       <int> 0
## $ OverallQual   <int> 0
## $ OverallCond   <int> 0
## $ YearBuilt     <int> 0
## $ YearRemodAdd  <int> 0
## $ MasVnrArea    <int> 0
## $ BsmtFinSF1    <int> 0
## $ BsmtFinSF2    <int> 0
## $ BsmtUnfSF     <int> 0
## $ TotalBsmtSF   <int> 0
## $ X1stFlrSF     <int> 0
## $ X2ndFlrSF     <int> 0
## $ LowQualFinSF  <int> 0
## $ GrLivArea     <int> 0
## $ BsmtFullBath  <int> 0
## $ BsmtHalfBath  <int> 0
## $ FullBath      <int> 0
## $ HalfBath      <int> 0
## $ BedroomAbvGr  <int> 0
## $ KitchenAbvGr  <int> 0
## $ TotRmsAbvGrd  <int> 0
## $ Fireplaces    <int> 0
## $ GarageYrBlt   <int> 0
## $ GarageCars    <int> 0
## $ GarageArea    <int> 0
## $ WoodDeckSF    <int> 0
## $ OpenPorchSF   <int> 0
## $ EnclosedPorch <int> 0
## $ X3SsnPorch    <int> 0
## $ ScreenPorch   <int> 0
## $ PoolArea      <int> 0
## $ MiscVal       <int> 0
## $ MoSold        <int> 0
## $ YrSold        <int> 0
## $ SalePrice     <int> 0
# Re-count missing values per column in the imputed test data.
test_df %>%
    summarise(across(everything(), ~ sum(is.na(.x)))) %>%
    glimpse()
## Rows: 1
## Columns: 37
## $ Id            <int> 0
## $ MSSubClass    <int> 0
## $ LotFrontage   <int> 0
## $ LotArea       <int> 0
## $ OverallQual   <int> 0
## $ OverallCond   <int> 0
## $ YearBuilt     <int> 0
## $ YearRemodAdd  <int> 0
## $ MasVnrArea    <int> 0
## $ BsmtFinSF1    <int> 0
## $ BsmtFinSF2    <int> 0
## $ BsmtUnfSF     <int> 0
## $ TotalBsmtSF   <int> 0
## $ X1stFlrSF     <int> 0
## $ X2ndFlrSF     <int> 0
## $ LowQualFinSF  <int> 0
## $ GrLivArea     <int> 0
## $ BsmtFullBath  <int> 0
## $ BsmtHalfBath  <int> 0
## $ FullBath      <int> 0
## $ HalfBath      <int> 0
## $ BedroomAbvGr  <int> 0
## $ KitchenAbvGr  <int> 0
## $ TotRmsAbvGrd  <int> 0
## $ Fireplaces    <int> 0
## $ GarageYrBlt   <int> 0
## $ GarageCars    <int> 0
## $ GarageArea    <int> 0
## $ WoodDeckSF    <int> 0
## $ OpenPorchSF   <int> 0
## $ EnclosedPorch <int> 0
## $ X3SsnPorch    <int> 0
## $ ScreenPorch   <int> 0
## $ PoolArea      <int> 0
## $ MiscVal       <int> 0
## $ MoSold        <int> 0
## $ YrSold        <int> 0

No missing values remain in the numeric columns of either the train or the test dataset, so the imputation worked as intended.

Build the model

# Simple linear regression: SalePrice explained by OpenPorchSF alone.
model_1 <- lm(SalePrice ~ OpenPorchSF, data = train_df)

summary(model_1)
## 
## Call:
## lm(formula = SalePrice ~ OpenPorchSF, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -326420  -43750  -14250   26036  572814 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 163250.07    2413.86   67.63   <2e-16 ***
## OpenPorchSF    378.72      29.79   12.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 75400 on 1458 degrees of freedom
## Multiple R-squared:  0.09977,    Adjusted R-squared:  0.09915 
## F-statistic: 161.6 on 1 and 1458 DF,  p-value: < 2.2e-16

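The main predictor variable OpenPorchSF is statistically significant (p < 2e-16), although on its own it explains only about 10% of the variance in SalePrice (R-squared of roughly 0.10). Next, I build a regression model using every numeric variable in the dataset.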

# Full model: OpenPorchSF is named first so it heads the coefficient table,
# and `.` brings in every remaining column of train_df (Id included).
model_2 <- lm(SalePrice ~ OpenPorchSF + ., data = train_df)
summary(model_2)
## 
## Call:
## lm(formula = SalePrice ~ OpenPorchSF + ., data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -472650  -16212   -2041   13637  303283 
## 
## Coefficients: (2 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.187e+05  1.413e+06   0.367 0.713546    
## OpenPorchSF   -1.802e-01  1.516e+01  -0.012 0.990522    
## Id            -9.036e-01  2.184e+00  -0.414 0.679160    
## MSSubClass    -1.719e+02  2.646e+01  -6.494 1.15e-10 ***
## LotFrontage   -3.484e+01  1.759e+01  -1.981 0.047802 *  
## LotArea        4.199e-01  1.015e-01   4.138 3.71e-05 ***
## OverallQual    1.724e+04  1.191e+03  14.482  < 2e-16 ***
## OverallCond    4.416e+03  1.028e+03   4.297 1.85e-05 ***
## YearBuilt      3.697e+02  6.835e+01   5.409 7.43e-08 ***
## YearRemodAdd   1.811e+02  6.836e+01   2.649 0.008167 ** 
## MasVnrArea     2.977e+01  5.627e+00   5.290 1.41e-07 ***
## BsmtFinSF1     1.877e+01  4.661e+00   4.026 5.96e-05 ***
## BsmtFinSF2     7.797e+00  7.054e+00   1.105 0.269219    
## BsmtUnfSF      8.826e+00  4.200e+00   2.101 0.035784 *  
## TotalBsmtSF           NA         NA      NA       NA    
## X1stFlrSF      4.798e+01  5.802e+00   8.270 3.04e-16 ***
## X2ndFlrSF      4.834e+01  4.966e+00   9.733  < 2e-16 ***
## LowQualFinSF   3.283e+01  1.984e+01   1.655 0.098182 .  
## GrLivArea             NA         NA      NA       NA    
## BsmtFullBath   9.126e+03  2.612e+03   3.494 0.000491 ***
## BsmtHalfBath   1.575e+03  4.090e+03   0.385 0.700200    
## FullBath       4.131e+03  2.821e+03   1.465 0.143248    
## HalfBath      -1.780e+03  2.664e+03  -0.668 0.504118    
## BedroomAbvGr  -1.034e+04  1.697e+03  -6.092 1.43e-09 ***
## KitchenAbvGr  -1.304e+04  5.214e+03  -2.501 0.012480 *  
## TotRmsAbvGrd   5.152e+03  1.238e+03   4.163 3.33e-05 ***
## Fireplaces     3.705e+03  1.779e+03   2.083 0.037436 *  
## GarageYrBlt   -9.454e+01  7.271e+01  -1.300 0.193757    
## GarageCars     1.091e+04  2.854e+03   3.823 0.000137 ***
## GarageArea     2.491e+00  1.006e+01   0.248 0.804405    
## WoodDeckSF     2.733e+01  8.002e+00   3.416 0.000654 ***
## EnclosedPorch  1.170e+01  1.685e+01   0.694 0.487588    
## X3SsnPorch     1.969e+01  3.139e+01   0.627 0.530513    
## ScreenPorch    5.488e+01  1.719e+01   3.193 0.001438 ** 
## PoolArea      -3.187e+01  2.366e+01  -1.347 0.178165    
## MiscVal       -4.131e-01  1.860e+00  -0.222 0.824249    
## MoSold        -1.032e+02  3.447e+02  -0.299 0.764719    
## YrSold        -7.347e+02  7.027e+02  -1.046 0.295944    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34740 on 1424 degrees of freedom
## Multiple R-squared:  0.8134, Adjusted R-squared:  0.8088 
## F-statistic: 177.4 on 35 and 1424 DF,  p-value: < 2.2e-16

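Overall this model is statistically significant (F-test p-value < 2.2e-16), and many of the individual predictors are significant as well. Next I want to see how the fit changes when I remove the two predictors whose coefficients are NA because of singularities (TotalBsmtSF and GrLivArea), along with Id and some of the predictors with high p-values.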

# Drop Id, the two predictors with NA coefficients in model_2 (GrLivArea and
# TotalBsmtSF), and four predictors with high p-values, then refit.
dropped_1 <- c(
    "Id", "GrLivArea", "TotalBsmtSF", "BsmtFinSF2",
    "MiscVal", "X3SsnPorch", "EnclosedPorch"
)
train_drop_1 <- train_df[, setdiff(names(train_df), dropped_1)]

model_3 <- lm(SalePrice ~ ., data = train_drop_1)
summary(model_3)
## 
## Call:
## lm(formula = SalePrice ~ ., data = train_drop_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -470204  -16157   -2085   13914  301795 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.214e+05  1.410e+06   0.370 0.711552    
## MSSubClass   -1.740e+02  2.639e+01  -6.595 5.97e-11 ***
## LotFrontage  -3.464e+01  1.752e+01  -1.978 0.048173 *  
## LotArea       4.267e-01  1.009e-01   4.229 2.50e-05 ***
## OverallQual   1.744e+04  1.179e+03  14.797  < 2e-16 ***
## OverallCond   4.340e+03  1.017e+03   4.267 2.11e-05 ***
## YearBuilt     3.601e+02  6.509e+01   5.532 3.76e-08 ***
## YearRemodAdd  1.795e+02  6.811e+01   2.636 0.008491 ** 
## MasVnrArea    2.965e+01  5.612e+00   5.284 1.46e-07 ***
## BsmtFinSF1    1.595e+01  3.947e+00   4.042 5.59e-05 ***
## BsmtUnfSF     6.614e+00  3.666e+00   1.804 0.071366 .  
## X1stFlrSF     5.051e+01  5.406e+00   9.343  < 2e-16 ***
## X2ndFlrSF     4.858e+01  4.949e+00   9.816  < 2e-16 ***
## LowQualFinSF  3.348e+01  1.977e+01   1.693 0.090579 .  
## BsmtFullBath  9.904e+03  2.537e+03   3.904 9.92e-05 ***
## BsmtHalfBath  2.263e+03  4.057e+03   0.558 0.577115    
## FullBath      4.159e+03  2.815e+03   1.478 0.139723    
## HalfBath     -1.690e+03  2.657e+03  -0.636 0.524822    
## BedroomAbvGr -1.033e+04  1.694e+03  -6.099 1.37e-09 ***
## KitchenAbvGr -1.357e+04  5.177e+03  -2.621 0.008860 ** 
## TotRmsAbvGrd  4.993e+03  1.231e+03   4.056 5.26e-05 ***
## Fireplaces    3.633e+03  1.775e+03   2.047 0.040817 *  
## GarageYrBlt  -9.660e+01  7.259e+01  -1.331 0.183526    
## GarageCars    1.078e+04  2.846e+03   3.789 0.000158 ***
## GarageArea    2.885e+00  1.004e+01   0.287 0.773830    
## WoodDeckSF    2.710e+01  7.935e+00   3.415 0.000655 ***
## OpenPorchSF  -3.472e-01  1.508e+01  -0.023 0.981639    
## ScreenPorch   5.424e+01  1.693e+01   3.203 0.001390 ** 
## PoolArea     -3.098e+01  2.354e+01  -1.316 0.188385    
## MoSold       -1.177e+02  3.437e+02  -0.343 0.732001    
## YrSold       -7.229e+02  7.016e+02  -1.030 0.302995    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34700 on 1429 degrees of freedom
## Multiple R-squared:  0.8131, Adjusted R-squared:  0.8092 
## F-statistic: 207.2 on 30 and 1429 DF,  p-value: < 2.2e-16

Since the predictive power of this model did not drop meaningfully after removing those variables, I am going to remove the rest of the clearly non-significant variables (p-value above 0.1) and see where the model stands afterwards.

# Drop the remaining predictors whose p-values in model_3 were above 0.1,
# then refit on the reduced training data.
dropped_2 <- c(
    "BsmtHalfBath", "FullBath", "HalfBath", "GarageYrBlt", "GarageArea",
    "OpenPorchSF", "PoolArea", "MoSold", "YrSold"
)
train_drop_2 <- train_drop_1[, setdiff(names(train_drop_1), dropped_2)]

model_4 <- lm(SalePrice ~ ., data = train_drop_2)
summary(model_4)
## 
## Call:
## lm(formula = SalePrice ~ ., data = train_drop_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -488591  -16531   -2181   13718  287300 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.050e+06  1.162e+05  -9.032  < 2e-16 ***
## MSSubClass   -1.707e+02  2.607e+01  -6.548 8.08e-11 ***
## LotFrontage  -3.748e+01  1.740e+01  -2.154 0.031373 *  
## LotArea       4.467e-01  1.005e-01   4.445 9.45e-06 ***
## OverallQual   1.755e+04  1.169e+03  15.011  < 2e-16 ***
## OverallCond   4.392e+03  1.007e+03   4.362 1.38e-05 ***
## YearBuilt     3.331e+02  5.268e+01   6.324 3.39e-10 ***
## YearRemodAdd  1.704e+02  6.481e+01   2.630 0.008637 ** 
## MasVnrArea    2.966e+01  5.567e+00   5.327 1.16e-07 ***
## BsmtFinSF1    1.575e+01  3.898e+00   4.039 5.64e-05 ***
## BsmtUnfSF     6.373e+00  3.632e+00   1.754 0.079575 .  
## X1stFlrSF     5.242e+01  5.111e+00  10.256  < 2e-16 ***
## X2ndFlrSF     4.837e+01  4.120e+00  11.741  < 2e-16 ***
## LowQualFinSF  3.112e+01  1.958e+01   1.589 0.112328    
## BsmtFullBath  8.980e+03  2.396e+03   3.748 0.000185 ***
## BedroomAbvGr -9.727e+03  1.659e+03  -5.863 5.64e-09 ***
## KitchenAbvGr -1.215e+04  5.082e+03  -2.392 0.016899 *  
## TotRmsAbvGrd  5.052e+03  1.223e+03   4.129 3.85e-05 ***
## Fireplaces    3.774e+03  1.745e+03   2.163 0.030741 *  
## GarageCars    1.084e+04  1.689e+03   6.415 1.91e-10 ***
## WoodDeckSF    2.577e+01  7.864e+00   3.277 0.001075 ** 
## ScreenPorch   5.173e+01  1.685e+01   3.071 0.002177 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34690 on 1438 degrees of freedom
## Multiple R-squared:  0.812,  Adjusted R-squared:  0.8093 
## F-statistic: 295.8 on 21 and 1438 DF,  p-value: < 2.2e-16

Lastly, I remove the last two variables that are not statistically significant at the 0.05 level (BsmtUnfSF and LowQualFinSF).

# Drop the two predictors that were not significant at the 0.05 level in
# model_4, then fit the final model.
dropped_3 <- c("BsmtUnfSF", "LowQualFinSF")
train_drop_3 <- train_drop_2[, setdiff(names(train_drop_2), dropped_3)]

model_5 <- lm(SalePrice ~ ., data = train_drop_3)
summary(model_5)
## 
## Call:
## lm(formula = SalePrice ~ ., data = train_drop_3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -479913  -16324   -2094   14005  287949 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.046e+06  1.155e+05  -9.058  < 2e-16 ***
## MSSubClass   -1.709e+02  2.590e+01  -6.597 5.86e-11 ***
## LotFrontage  -3.839e+01  1.737e+01  -2.211 0.027221 *  
## LotArea       4.478e-01  1.006e-01   4.451 9.19e-06 ***
## OverallQual   1.813e+04  1.137e+03  15.954  < 2e-16 ***
## OverallCond   4.080e+03  9.996e+02   4.082 4.72e-05 ***
## YearBuilt     3.207e+02  5.194e+01   6.174 8.64e-10 ***
## YearRemodAdd  1.809e+02  6.474e+01   2.795 0.005266 ** 
## MasVnrArea    3.003e+01  5.556e+00   5.405 7.59e-08 ***
## BsmtFinSF1    1.130e+01  2.972e+00   3.804 0.000148 ***
## X1stFlrSF     5.591e+01  4.621e+00  12.099  < 2e-16 ***
## X2ndFlrSF     4.734e+01  4.100e+00  11.548  < 2e-16 ***
## BsmtFullBath  8.330e+03  2.370e+03   3.515 0.000454 ***
## BedroomAbvGr -9.582e+03  1.659e+03  -5.775 9.44e-09 ***
## KitchenAbvGr -1.318e+04  5.058e+03  -2.605 0.009270 ** 
## TotRmsAbvGrd  5.304e+03  1.210e+03   4.383 1.26e-05 ***
## Fireplaces    3.433e+03  1.741e+03   1.972 0.048850 *  
## GarageCars    1.070e+04  1.689e+03   6.334 3.19e-10 ***
## WoodDeckSF    2.541e+01  7.870e+00   3.229 0.001272 ** 
## ScreenPorch   5.156e+01  1.686e+01   3.058 0.002268 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34740 on 1440 degrees of freedom
## Multiple R-squared:  0.8113, Adjusted R-squared:  0.8088 
## F-statistic: 325.8 on 19 and 1440 DF,  p-value: < 2.2e-16

After removing the last non-significant variables, the multiple R-squared and adjusted R-squared barely changed (0.8120 to 0.8113 and 0.8093 to 0.8088, respectively).

# Draw the lm diagnostic plots for model_5 on a 2 x 2 grid, then put the
# graphics parameters back so later plots are not forced into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(model_5)
par(old_par)

Lastly, let's predict the test set with this model.

# Predict SalePrice for the test set with the final model (model_5).
# The predictor names are read from the fitted model instead of being retyped
# by hand, so this step cannot drift out of sync with the model. Id is not a
# model term; it is kept only to label the rows of the submission.
keep_cols <- c("Id", attr(terms(model_5), "term.labels"))

# Fail early, with a readable message, if the test data lacks a needed column.
missing_cols <- setdiff(keep_cols, names(test_df))
if (length(missing_cols) > 0) {
    stop(
        "test_df is missing column(s): ", paste(missing_cols, collapse = ", "),
        call. = FALSE
    )
}

test_pred <- test_df[, names(test_df) %in% keep_cols]
predictions <- predict(model_5, newdata = test_pred)

test_pred$SalePrice <- predictions

# Keep only the two submission columns: Id and the predicted SalePrice.
test_pred <- test_pred %>%
    dplyr::select(Id, SalePrice)

Write the predictions out to a CSV file

# Save the submission file (Id, SalePrice). row.names = FALSE keeps R's
# row-number column out of the file, so the CSV holds exactly those two columns.
# NOTE: the destination is an absolute path on the author's machine; change it
# before running this anywhere else.
write.csv(
    test_pred,
    "C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2024\\Fundamentals of Computational Mathematics DATA 605\\FINAL_PROJECT\\saleprice_Jburns.csv",
    row.names = FALSE
)

Kaggle Info

Kaggle Username: jonathanburns22 | Score:0.22866

Thank you for a great semester Dr. Larry, see you in DATA 604 this summer!