library(tidyverse)   # data manipulation, pipes, and ggplot2
library(moments)     # skewness()
library(ggcorrplot)  # correlation heatmaps
library(reshape2)    # data reshaping
library(naniar)      # missing-data visualization
library(corrplot)    # correlation matrix plots
library(DescTools)   # descriptive statistics utilities
Note: I downloaded the data from Kaggle, uploaded it to my GitHub repository, and read it in from there.
test<-read.csv("https://raw.githubusercontent.com/jonburns2454/DATA-605/main/final_data/test.csv")
train<-read.csv("https://raw.githubusercontent.com/jonburns2454/DATA-605/main/final_data/train.csv")
summary(train)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
Pick one of the quantitative independent variables from the training data set (train.csv), and define that variable as X. Make sure this variable is skewed to the right! Pick the dependent variable and define it as Y.
Sifting through the data, the OpenPorchSF variable seems to fit the task:
ggplot(train, aes(x = OpenPorchSF)) +
    geom_histogram(bins = 30)
A skewness score above one should be more than enough to demonstrate the variable's right skew.
skewness(train$OpenPorchSF)
## [1] 2.361912
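For context, here is a quick sketch of how one could scan every numeric column for right-skewed candidates (my own addition, using moments::skewness and the already-loaded tidyverse):
# Rank numeric columns by sample skewness; values above 1 indicate strong right skew
train %>%
    keep(is.numeric) %>%
    summarise(across(everything(), ~ skewness(.x, na.rm = TRUE))) %>%
    pivot_longer(everything(), names_to = "variable", values_to = "skew") %>%
    arrange(desc(skew))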
X <- train$OpenPorchSF
Y <- train$SalePrice
Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 3rd quartile of the X variable, and the small letter “y” is estimated as the 2nd quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.
x <- quantile(X, 0.75)  # 3rd quartile of OpenPorchSF
y <- quantile(Y, 0.50)  # 2nd quartile (median) of SalePrice
p_a <- sum(X > x & Y > y) / sum(Y > y)   # P(X > x | Y > y)
p_b <- sum(X > x & Y > y) / nrow(train)  # P(X > x, Y > y)
p_c <- sum(X < x & Y > y) / sum(Y > y)   # P(X < x | Y > y); p_* names avoid masking base::c
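Printing the three probabilities for reference (a quick check; the values should match the figures quoted in the interpretations below):
# P(X>x | Y>y), P(X>x, Y>y), P(X<x | Y>y); expected ~0.3668, ~0.1829, 0.625
round(c(a = p_a, b = p_b, c = p_c), 4)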
table_of_counts <- table(X > x, Y > y)
print(table_of_counts)
##
## FALSE TRUE
## FALSE 637 461
## TRUE 95 267
colnames(table_of_counts) <- c("<=2nd quartile", ">2nd quartile")
rownames(table_of_counts) <- c("<=3rd quartile", ">3rd quartile")
table_of_counts <- addmargins(table_of_counts)
print(table_of_counts)
##
##                  <=2nd quartile >2nd quartile  Sum
##   <=3rd quartile            637           461 1098
##   >3rd quartile              95           267  362
##   Sum                       732           728 1460
Interpret the meanings of the probabilities:
a. P(X>x | Y>y)
The probability that X exceeds x (the 3rd quartile) given that Y exceeds y (the 2nd quartile, i.e., the median). In the context of the data, this is the probability that OpenPorchSF is above its 3rd quartile among houses whose SalePrice is above the median: about 0.3668 (267/728).
b. P(X>x, Y>y)
The joint probability that OpenPorchSF is above its 3rd quartile and SalePrice is above its median: about 0.1829 (267/1460).
c. P(X<x | Y>y)
Lastly, the probability that OpenPorchSF is strictly below its 3rd quartile given that SalePrice is above its median: 0.625 (455/728). Note that a and c do not sum exactly to 1 because a handful of observations equal the 3rd-quartile value (68 square feet) exactly.
Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 3rd quartile for X, and let B be the new variable counting those observations above the 2nd quartile for Y. Does P(A|B)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.
Define A and B, then compare P(A|B) against P(A)P(B). (Strictly speaking, independence requires P(A∩B) = P(A)P(B), or equivalently P(A|B) = P(A); both comparisons point the same way here.)
A <- as.numeric(X > x)  # indicator: OpenPorchSF above its 3rd quartile
B <- as.numeric(Y > y)  # indicator: SalePrice above its median
#P(A|B)
P_A_B <- sum(A & B)/sum(B)
# P(A) and P(B)
P_A <- sum(A) / length(A)
P_B <- sum(B) / length(B)
# Product of the marginal probabilities
P_A_P_B <- P_A * P_B
print(paste("The probability of P(A|B) is:", P_A_B, ", while the probability of P(A)P(B) is:", P_A_P_B))
## [1] "The probability of P(A|B) is: 0.366758241758242 , while the probability of P(A)P(B) is: 0.123632951773316"
P(A|B) = 0.367 is clearly not equal to P(A)P(B) = 0.124. More to the point, P(A|B) also differs from P(A) = 362/1460 ≈ 0.248, which is the proper benchmark for independence, so the split does not look independent. Lastly, run a Chi-square test to validate this finding.
chi_a_b <- chisq.test(A,B)
print(chi_a_b)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: A and B
## X-squared = 108.66, df = 1, p-value < 2.2e-16
The null hypothesis here is that OpenPorchSF and SalePrice are independent of one another. The Chi-square test produced a very small p-value (< 2.2e-16) and a large X-squared statistic, so we reject the null, which suggests a dependent relationship between OpenPorchSF and SalePrice.
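For intuition, the expected cell counts under independence follow directly from the table margins (row total × column total / grand total); chisq.test stores them, so they are easy to inspect:
# E.g., the expected count for (X > x, Y > y) is 362 * 728 / 1460 = 180.5,
# far below the observed 267 -- consistent with rejecting independence
chi_a_b$expected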
Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y. Provide a 95% CI for the difference in the mean of the variables. Derive a correlation matrix for two of the quantitative variables you selected. Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval. Discuss the meaning of your analysis.
summary(X)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 25.00 46.66 68.00 547.00
summary(Y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
missing_train <- train %>%
summarise_all(~ sum(is.na(.))) %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "missing_count") %>%
filter(missing_count>0)
# It's important to note that these are the only variables with missingness.
ggplot(missing_train, aes(x = reorder(variable, -missing_count), y = missing_count)) +
geom_bar(stat = "identity", fill = "skyblue") +
coord_flip() +
labs(title = "Count of Missing Values by Variable",
x = "Variable",
y = "Count of Missing Values") +
theme_minimal() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold"))
The missingness will eventually be a problem, so this training df will
require some form of imputation down the road.
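Since naniar is already loaded, the same missingness overview is also available as a one-liner (an optional convenience, not required for the analysis):
# naniar's built-in missing-count plot, one bar per variable
gg_miss_var(train)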
train %>%
keep(is.numeric) %>%
gather(key="variable", value = "value") %>%
ggplot(aes(x=variable, y=value))+
geom_violin()+
facet_wrap(~variable, scales = 'free')+
theme(strip.text = element_text(size=6))+
theme(axis.text.y = element_text(size = 5))
## Warning: Removed 348 rows containing non-finite values (`stat_ydensity()`).
train %>%
ggplot(aes(x=OpenPorchSF, y = SalePrice))+
geom_point()+
labs(x = "OpenPorchSF (X)", y = "SalePrice (Y)", title = "OpenPorchSF vs. SalePrice")+
geom_smooth(method="lm", se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'
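Because both variables are right-skewed, an optional variant of the scatterplot (my own addition) drops the zero-porch homes and puts price on a log scale, which makes the relationship easier to see:
# Log-scale view of the same relationship, excluding houses with no open porch
train %>%
    filter(OpenPorchSF > 0) %>%
    ggplot(aes(x = OpenPorchSF, y = SalePrice)) +
    geom_point(alpha = 0.4) +
    scale_y_log10() +
    geom_smooth(method = "lm", se = FALSE)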
Provide a 95% CI for the difference in the mean of the variables.
# I'll use a Welch two-sample t-test for this
t.test(x = train$OpenPorchSF,y = train$SalePrice, conf.level = 0.95)
##
## Welch Two Sample t-test
##
## data: train$OpenPorchSF and train$SalePrice
## t = -86.996, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -184952.9 -176796.2
## sample estimates:
## mean of x mean of y
## 46.66027 180921.19589
Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval. Discuss the meaning of your analysis.
cor_test_1 <- cor.test(train$OpenPorchSF, train$SalePrice, method = "pearson", conf.level = 0.99)
print(cor_test_1)
##
## Pearson's product-moment correlation
##
## data: train$OpenPorchSF and train$SalePrice
## t = 12.711, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.2538797 0.3752497
## sample estimates:
## cor
## 0.3158562
cor_test_2 <- cor.test(train$OpenPorchSF, train$SalePrice, method = "kendall", conf.level = 0.99)
print(cor_test_2)
##
## Kendall's rank correlation tau
##
## data: train$OpenPorchSF and train$SalePrice
## z = 18.724, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.350161
I wanted to compare two different correlation methods, Pearson and Kendall. Both yielded statistically significant results with p-values below 2.2e-16. However, Kendall's tau is rank-based and does not assume normally distributed data, so given how right-skewed OpenPorchSF is, the Kendall cor.test is the more trustworthy of the two.
In terms of the question, the null hypothesis of zero correlation is rejected, backed by the very low p-values. The Pearson test's 99% confidence interval (0.254, 0.375) excludes zero, and the Kendall test agrees: the relationship between open porch square footage and sale price is statistically significant.
t.test(train$OpenPorchSF,train$SalePrice, conf.level = 0.99)
##
## Welch Two Sample t-test
##
## data: train$OpenPorchSF and train$SalePrice
## t = -86.996, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
## -186237.0 -175512.1
## sample estimates:
## mean of x mean of y
## 46.66027 180921.19589
This confidence interval indicates 99% confidence that the difference in means between OpenPorchSF and SalePrice lies between -186,237.0 and -175,512.1. Since the two variables are measured on entirely different scales (square feet versus dollars), this difference is dominated by the mean sale price, so the interval is statistically valid but not especially meaningful on its own.
Invert your correlation matrix. (This is known as the precision matrix and contains variance inflation factors on the diagonal.)
cor_X_Y <- cor(train[,c("OpenPorchSF","SalePrice")])
print(cor_X_Y)
## OpenPorchSF SalePrice
## OpenPorchSF 1.0000000 0.3158562
## SalePrice 0.3158562 1.0000000
precision_mat <- solve(cor_X_Y)
print(precision_mat)
## OpenPorchSF SalePrice
## OpenPorchSF 1.1108213 -0.3508598
## SalePrice -0.3508598 1.1108213
round(cor_X_Y %*% precision_mat, 10)
##             OpenPorchSF SalePrice
## OpenPorchSF           1         0
## SalePrice             0         1
round(precision_mat %*% cor_X_Y, 10)
##             OpenPorchSF SalePrice
## OpenPorchSF           1         0
## SalePrice             0         1
Matrix-multiplying the correlation matrix by its inverse, in either order, recovers the identity matrix, as expected. The diagonal entries of the precision matrix, 1.1108 = 1/(1 - 0.3158562^2), are the variance inflation factors mentioned in the prompt.
Conduct principal components analysis (research this!) and interpret. Discuss.
pca_results <- prcomp(train[,c("OpenPorchSF","SalePrice")], scale = TRUE)
summary(pca_results)
## Importance of components:
## PC1 PC2
## Standard deviation 1.1471 0.8271
## Proportion of Variance 0.6579 0.3421
## Cumulative Proportion 0.6579 1.0000
Visualizing the PCA analysis:
library("factoextra")
fviz_pca_ind(pca_results, col.ind = "coral3")
fviz_eig(pca_results, addlabels = T)
The summary of the principal component analysis captures the amount of variance explained by each component. As shown in both summary(pca_results) and the scree plot, PC1 accounts for 65.8% of the variance in this bivariate dataset and PC2 for the remaining 34.2%. Together the components capture all of the variance, though with only two variables in the PCA (OpenPorchSF, SalePrice) there are necessarily only two dimensions.
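For completeness, the loadings (rotation) matrix shows how each variable maps onto the components. With only two standardized variables, both must load equally in magnitude (±1/√2 ≈ 0.707) on each component, and the PC1 standard deviation works out to \(\sqrt{1 + r} = \sqrt{1.3159} \approx 1.147\), matching the summary above:
# Each column of the rotation matrix is a principal component direction
pca_results$rotation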
Many times, it makes sense to fit a closed form distribution to data. For your variable that is skewed to the right, shift it so that the minimum value is above zero. Then load the MASS package and run fitdistr to fit an exponential probability density function.(See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
Negative values are not possible for either of these variables: a house cannot have a negative price, nor negative open porch square footage. Nonetheless, OpenPorchSF needs to be shifted so there are no zero values.
summary(train$OpenPorchSF)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 25.00 46.66 68.00 547.00
summary(train$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
shift_X <- train$OpenPorchSF - min(train$OpenPorchSF)+1
library(MASS)
library(gridExtra)
Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))).
fit_X <- fitdistr(shift_X, densfun = "exponential")
lambda <- fit_X$estimate
exp_sample <- rexp(1000, rate = lambda)  # exp_sample avoids masking base::sample
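As a sanity check (my own addition): for an exponential distribution the maximum-likelihood estimate of the rate is simply the reciprocal of the sample mean, so fitdistr's estimate can be verified directly:
# fitdistr's rate estimate should equal 1 / mean(shift_X)
lambda
1 / mean(shift_X)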
Plot a histogram and compare it with a histogram of your original variable.
par(mfrow = c(1, 2))
hist(train$OpenPorchSF, main = "Original OpenPorchSF", xlab = "Square Footage", breaks = 30)
hist(exp_sample, main = "Exponential Distribution Sample", xlab = "Sample Square Footage", col = "blue", breaks = 30)
Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).
perc_5 <- qexp(0.05, rate = lambda)
perc_95 <- qexp(0.95, rate = lambda)
print(paste(perc_5,perc_95))
## [1] "2.44465246346802 142.777420906151"
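These values follow directly from the exponential CDF \(F(q) = 1 - e^{-\lambda q}\): solving for the p-th percentile gives \(q_p = -\ln(1 - p)/\lambda\), so, for example, the 95th percentile is \(-\ln(0.05)/\lambda \approx 2.996/\lambda\), consistent with the output above.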
Also generate a 95% confidence interval from the empirical data, assuming normality.
mu <- mean(train$OpenPorchSF)
Std_Error <- sd(train$OpenPorchSF)/sqrt(nrow(train))
Marg_Error <- qt(0.975, df = nrow(train)-1)*Std_Error
confidence_int <-c(mu - Marg_Error, mu + Marg_Error)
print(confidence_int)
## [1] 43.25888 50.06167
The CI for the empirical data tells me I can be 95% confident that the mean of the original open porch square footage variable lies between 43.26 SF and 50.06 SF.
This can also be verified with a t-test. The CIs match, meaning the calculations above can be trusted.
CI <- t.test(train$OpenPorchSF)
print(CI)
##
## One Sample t-test
##
## data: train$OpenPorchSF
## t = 26.909, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 43.25888 50.06167
## sample estimates:
## mean of x
## 46.66027
Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
perc_95 <- quantile(train$OpenPorchSF,probs=0.95)
perc_5 <- quantile(train$OpenPorchSF,probs=0.05)
print(paste("The 5th percentile:",perc_5,", The 95th percentile:", perc_95))
## [1] "The 5th percentile: 0 , The 95th percentile: 175.05"
This can be interpreted as 90% of the data falling between 0 and 175.05 square feet. Looking back at summary(train$OpenPorchSF), the data is so clumped around zero that both the minimum and the 1st quartile are zero, so an empirical 5th percentile of 0 makes sense in context.
summary(train$OpenPorchSF)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 25.00 46.66 68.00 547.00
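For a compact comparison (a sketch; the model percentiles are shifted back down by 1 to undo the earlier +1 adjustment):
# Empirical vs. fitted-exponential percentiles on the original scale
rbind(empirical   = quantile(train$OpenPorchSF, probs = c(0.05, 0.95)),
      exponential = qexp(c(0.05, 0.95), rate = lambda) - 1)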
Build some type of regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
From the brief look at the test and train datasets earlier, I know there will be a problem with missing values. Due to this, I will try an imputation technique.
Check NA’s
train %>%
summarize_all(~sum(is.na(.))) %>%
glimpse()
## Rows: 1
## Columns: 81
## $ Id <int> 0
## $ MSSubClass <int> 0
## $ MSZoning <int> 0
## $ LotFrontage <int> 259
## $ LotArea <int> 0
## $ Street <int> 0
## $ Alley <int> 1369
## $ LotShape <int> 0
## $ LandContour <int> 0
## $ Utilities <int> 0
## $ LotConfig <int> 0
## $ LandSlope <int> 0
## $ Neighborhood <int> 0
## $ Condition1 <int> 0
## $ Condition2 <int> 0
## $ BldgType <int> 0
## $ HouseStyle <int> 0
## $ OverallQual <int> 0
## $ OverallCond <int> 0
## $ YearBuilt <int> 0
## $ YearRemodAdd <int> 0
## $ RoofStyle <int> 0
## $ RoofMatl <int> 0
## $ Exterior1st <int> 0
## $ Exterior2nd <int> 0
## $ MasVnrType <int> 8
## $ MasVnrArea <int> 8
## $ ExterQual <int> 0
## $ ExterCond <int> 0
## $ Foundation <int> 0
## $ BsmtQual <int> 37
## $ BsmtCond <int> 37
## $ BsmtExposure <int> 38
## $ BsmtFinType1 <int> 37
## $ BsmtFinSF1 <int> 0
## $ BsmtFinType2 <int> 38
## $ BsmtFinSF2 <int> 0
## $ BsmtUnfSF <int> 0
## $ TotalBsmtSF <int> 0
## $ Heating <int> 0
## $ HeatingQC <int> 0
## $ CentralAir <int> 0
## $ Electrical <int> 1
## $ X1stFlrSF <int> 0
## $ X2ndFlrSF <int> 0
## $ LowQualFinSF <int> 0
## $ GrLivArea <int> 0
## $ BsmtFullBath <int> 0
## $ BsmtHalfBath <int> 0
## $ FullBath <int> 0
## $ HalfBath <int> 0
## $ BedroomAbvGr <int> 0
## $ KitchenAbvGr <int> 0
## $ KitchenQual <int> 0
## $ TotRmsAbvGrd <int> 0
## $ Functional <int> 0
## $ Fireplaces <int> 0
## $ FireplaceQu <int> 690
## $ GarageType <int> 81
## $ GarageYrBlt <int> 81
## $ GarageFinish <int> 81
## $ GarageCars <int> 0
## $ GarageArea <int> 0
## $ GarageQual <int> 81
## $ GarageCond <int> 81
## $ PavedDrive <int> 0
## $ WoodDeckSF <int> 0
## $ OpenPorchSF <int> 0
## $ EnclosedPorch <int> 0
## $ X3SsnPorch <int> 0
## $ ScreenPorch <int> 0
## $ PoolArea <int> 0
## $ PoolQC <int> 1453
## $ Fence <int> 1179
## $ MiscFeature <int> 1406
## $ MiscVal <int> 0
## $ MoSold <int> 0
## $ YrSold <int> 0
## $ SaleType <int> 0
## $ SaleCondition <int> 0
## $ SalePrice <int> 0
test %>%
summarize_all(~sum(is.na(.))) %>%
glimpse()
## Rows: 1
## Columns: 80
## $ Id <int> 0
## $ MSSubClass <int> 0
## $ MSZoning <int> 4
## $ LotFrontage <int> 227
## $ LotArea <int> 0
## $ Street <int> 0
## $ Alley <int> 1352
## $ LotShape <int> 0
## $ LandContour <int> 0
## $ Utilities <int> 2
## $ LotConfig <int> 0
## $ LandSlope <int> 0
## $ Neighborhood <int> 0
## $ Condition1 <int> 0
## $ Condition2 <int> 0
## $ BldgType <int> 0
## $ HouseStyle <int> 0
## $ OverallQual <int> 0
## $ OverallCond <int> 0
## $ YearBuilt <int> 0
## $ YearRemodAdd <int> 0
## $ RoofStyle <int> 0
## $ RoofMatl <int> 0
## $ Exterior1st <int> 1
## $ Exterior2nd <int> 1
## $ MasVnrType <int> 16
## $ MasVnrArea <int> 15
## $ ExterQual <int> 0
## $ ExterCond <int> 0
## $ Foundation <int> 0
## $ BsmtQual <int> 44
## $ BsmtCond <int> 45
## $ BsmtExposure <int> 44
## $ BsmtFinType1 <int> 42
## $ BsmtFinSF1 <int> 1
## $ BsmtFinType2 <int> 42
## $ BsmtFinSF2 <int> 1
## $ BsmtUnfSF <int> 1
## $ TotalBsmtSF <int> 1
## $ Heating <int> 0
## $ HeatingQC <int> 0
## $ CentralAir <int> 0
## $ Electrical <int> 0
## $ X1stFlrSF <int> 0
## $ X2ndFlrSF <int> 0
## $ LowQualFinSF <int> 0
## $ GrLivArea <int> 0
## $ BsmtFullBath <int> 2
## $ BsmtHalfBath <int> 2
## $ FullBath <int> 0
## $ HalfBath <int> 0
## $ BedroomAbvGr <int> 0
## $ KitchenAbvGr <int> 0
## $ KitchenQual <int> 1
## $ TotRmsAbvGrd <int> 0
## $ Functional <int> 2
## $ Fireplaces <int> 0
## $ FireplaceQu <int> 730
## $ GarageType <int> 76
## $ GarageYrBlt <int> 78
## $ GarageFinish <int> 78
## $ GarageCars <int> 1
## $ GarageArea <int> 1
## $ GarageQual <int> 78
## $ GarageCond <int> 78
## $ PavedDrive <int> 0
## $ WoodDeckSF <int> 0
## $ OpenPorchSF <int> 0
## $ EnclosedPorch <int> 0
## $ X3SsnPorch <int> 0
## $ ScreenPorch <int> 0
## $ PoolArea <int> 0
## $ PoolQC <int> 1456
## $ Fence <int> 1169
## $ MiscFeature <int> 1408
## $ MiscVal <int> 0
## $ MoSold <int> 0
## $ YrSold <int> 0
## $ SaleType <int> 1
## $ SaleCondition <int> 0
Select only the numeric columns:
num_train <- train %>%
select_if(is.numeric)
num_test <- test %>%
select_if(is.numeric)
Impute:
library(mice)
set.seed(1999)
impute_train <- mice(num_train, m = 1, method = 'pmm', maxit = 5)  # single imputation via predictive mean matching
##
## iter imp variable
## 1 1 LotFrontage* MasVnrArea* GarageYrBlt*
## 2 1 LotFrontage* MasVnrArea* GarageYrBlt*
## 3 1 LotFrontage* MasVnrArea* GarageYrBlt*
## 4 1 LotFrontage* MasVnrArea* GarageYrBlt*
## 5 1 LotFrontage* MasVnrArea* GarageYrBlt*
comp_im_train <- complete(impute_train)
impute_test <- mice(num_test, m=1, method = 'pmm', maxit = 5)
##
## iter imp variable
## 1 1 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 2 1 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 3 1 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 4 1 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 5 1 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
comp_im_test <- complete(impute_test)
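Before trusting the imputations, a quick diagnostic (my own addition, using mice's built-in lattice method) compares the densities of imputed versus observed values for the columns that were filled in:
# Compare densities of observed vs. imputed values for the imputed training columns
densityplot(impute_train, ~ LotFrontage + MasVnrArea + GarageYrBlt)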
Assign the completed datasets to working names:
train_df <-comp_im_train
test_df <-comp_im_test
Check missing again
train_df %>%
summarize_all(~sum(is.na(.))) %>%
glimpse()
## Rows: 1
## Columns: 38
## $ Id <int> 0
## $ MSSubClass <int> 0
## $ LotFrontage <int> 0
## $ LotArea <int> 0
## $ OverallQual <int> 0
## $ OverallCond <int> 0
## $ YearBuilt <int> 0
## $ YearRemodAdd <int> 0
## $ MasVnrArea <int> 0
## $ BsmtFinSF1 <int> 0
## $ BsmtFinSF2 <int> 0
## $ BsmtUnfSF <int> 0
## $ TotalBsmtSF <int> 0
## $ X1stFlrSF <int> 0
## $ X2ndFlrSF <int> 0
## $ LowQualFinSF <int> 0
## $ GrLivArea <int> 0
## $ BsmtFullBath <int> 0
## $ BsmtHalfBath <int> 0
## $ FullBath <int> 0
## $ HalfBath <int> 0
## $ BedroomAbvGr <int> 0
## $ KitchenAbvGr <int> 0
## $ TotRmsAbvGrd <int> 0
## $ Fireplaces <int> 0
## $ GarageYrBlt <int> 0
## $ GarageCars <int> 0
## $ GarageArea <int> 0
## $ WoodDeckSF <int> 0
## $ OpenPorchSF <int> 0
## $ EnclosedPorch <int> 0
## $ X3SsnPorch <int> 0
## $ ScreenPorch <int> 0
## $ PoolArea <int> 0
## $ MiscVal <int> 0
## $ MoSold <int> 0
## $ YrSold <int> 0
## $ SalePrice <int> 0
test_df %>%
summarize_all(~sum(is.na(.))) %>%
glimpse()
## Rows: 1
## Columns: 37
## $ Id <int> 0
## $ MSSubClass <int> 0
## $ LotFrontage <int> 0
## $ LotArea <int> 0
## $ OverallQual <int> 0
## $ OverallCond <int> 0
## $ YearBuilt <int> 0
## $ YearRemodAdd <int> 0
## $ MasVnrArea <int> 0
## $ BsmtFinSF1 <int> 0
## $ BsmtFinSF2 <int> 0
## $ BsmtUnfSF <int> 0
## $ TotalBsmtSF <int> 0
## $ X1stFlrSF <int> 0
## $ X2ndFlrSF <int> 0
## $ LowQualFinSF <int> 0
## $ GrLivArea <int> 0
## $ BsmtFullBath <int> 0
## $ BsmtHalfBath <int> 0
## $ FullBath <int> 0
## $ HalfBath <int> 0
## $ BedroomAbvGr <int> 0
## $ KitchenAbvGr <int> 0
## $ TotRmsAbvGrd <int> 0
## $ Fireplaces <int> 0
## $ GarageYrBlt <int> 0
## $ GarageCars <int> 0
## $ GarageArea <int> 0
## $ WoodDeckSF <int> 0
## $ OpenPorchSF <int> 0
## $ EnclosedPorch <int> 0
## $ X3SsnPorch <int> 0
## $ ScreenPorch <int> 0
## $ PoolArea <int> 0
## $ MiscVal <int> 0
## $ MoSold <int> 0
## $ YrSold <int> 0
The data has been imputed properly in the test and train datasets.
model_1 <- lm(SalePrice~OpenPorchSF, train_df)
summary(model_1)
##
## Call:
## lm(formula = SalePrice ~ OpenPorchSF, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -326420 -43750 -14250 26036 572814
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 163250.07 2413.86 67.63 <2e-16 ***
## OpenPorchSF 378.72 29.79 12.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75400 on 1458 degrees of freedom
## Multiple R-squared: 0.09977, Adjusted R-squared: 0.09915
## F-statistic: 161.6 on 1 and 1458 DF, p-value: < 2.2e-16
The main predictor variable OpenPorchSF is statistically significant, but with a multiple R-squared of only about 0.10 it explains little of the variance on its own. Now to build a regression model using the entire numerical dataset.
model_2 <- lm(SalePrice~OpenPorchSF+., train_df)
summary(model_2)
##
## Call:
## lm(formula = SalePrice ~ OpenPorchSF + ., data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -472650 -16212 -2041 13637 303283
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.187e+05 1.413e+06 0.367 0.713546
## OpenPorchSF -1.802e-01 1.516e+01 -0.012 0.990522
## Id -9.036e-01 2.184e+00 -0.414 0.679160
## MSSubClass -1.719e+02 2.646e+01 -6.494 1.15e-10 ***
## LotFrontage -3.484e+01 1.759e+01 -1.981 0.047802 *
## LotArea 4.199e-01 1.015e-01 4.138 3.71e-05 ***
## OverallQual 1.724e+04 1.191e+03 14.482 < 2e-16 ***
## OverallCond 4.416e+03 1.028e+03 4.297 1.85e-05 ***
## YearBuilt 3.697e+02 6.835e+01 5.409 7.43e-08 ***
## YearRemodAdd 1.811e+02 6.836e+01 2.649 0.008167 **
## MasVnrArea 2.977e+01 5.627e+00 5.290 1.41e-07 ***
## BsmtFinSF1 1.877e+01 4.661e+00 4.026 5.96e-05 ***
## BsmtFinSF2 7.797e+00 7.054e+00 1.105 0.269219
## BsmtUnfSF 8.826e+00 4.200e+00 2.101 0.035784 *
## TotalBsmtSF NA NA NA NA
## X1stFlrSF 4.798e+01 5.802e+00 8.270 3.04e-16 ***
## X2ndFlrSF 4.834e+01 4.966e+00 9.733 < 2e-16 ***
## LowQualFinSF 3.283e+01 1.984e+01 1.655 0.098182 .
## GrLivArea NA NA NA NA
## BsmtFullBath 9.126e+03 2.612e+03 3.494 0.000491 ***
## BsmtHalfBath 1.575e+03 4.090e+03 0.385 0.700200
## FullBath 4.131e+03 2.821e+03 1.465 0.143248
## HalfBath -1.780e+03 2.664e+03 -0.668 0.504118
## BedroomAbvGr -1.034e+04 1.697e+03 -6.092 1.43e-09 ***
## KitchenAbvGr -1.304e+04 5.214e+03 -2.501 0.012480 *
## TotRmsAbvGrd 5.152e+03 1.238e+03 4.163 3.33e-05 ***
## Fireplaces 3.705e+03 1.779e+03 2.083 0.037436 *
## GarageYrBlt -9.454e+01 7.271e+01 -1.300 0.193757
## GarageCars 1.091e+04 2.854e+03 3.823 0.000137 ***
## GarageArea 2.491e+00 1.006e+01 0.248 0.804405
## WoodDeckSF 2.733e+01 8.002e+00 3.416 0.000654 ***
## EnclosedPorch 1.170e+01 1.685e+01 0.694 0.487588
## X3SsnPorch 1.969e+01 3.139e+01 0.627 0.530513
## ScreenPorch 5.488e+01 1.719e+01 3.193 0.001438 **
## PoolArea -3.187e+01 2.366e+01 -1.347 0.178165
## MiscVal -4.131e-01 1.860e+00 -0.222 0.824249
## MoSold -1.032e+02 3.447e+02 -0.299 0.764719
## YrSold -7.347e+02 7.027e+02 -1.046 0.295944
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34740 on 1424 degrees of freedom
## Multiple R-squared: 0.8134, Adjusted R-squared: 0.8088
## F-statistic: 177.4 on 35 and 1424 DF, p-value: < 2.2e-16
This model produced statistically significant results for many predictors, but two coefficients were not defined because of singularities (TotalBsmtSF and GrLivArea are exact linear combinations of other columns), so I want to see how the fit changes when I remove those aliased variables and some high p-value predictors.
train_drop_1 <- train_df[, !(names(train_df) %in% c("Id", "GrLivArea", "TotalBsmtSF", "BsmtFinSF2", "MiscVal", "X3SsnPorch", "EnclosedPorch"))]
model_3 <- lm(SalePrice~.,train_drop_1)
summary(model_3)
##
## Call:
## lm(formula = SalePrice ~ ., data = train_drop_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -470204 -16157 -2085 13914 301795
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.214e+05 1.410e+06 0.370 0.711552
## MSSubClass -1.740e+02 2.639e+01 -6.595 5.97e-11 ***
## LotFrontage -3.464e+01 1.752e+01 -1.978 0.048173 *
## LotArea 4.267e-01 1.009e-01 4.229 2.50e-05 ***
## OverallQual 1.744e+04 1.179e+03 14.797 < 2e-16 ***
## OverallCond 4.340e+03 1.017e+03 4.267 2.11e-05 ***
## YearBuilt 3.601e+02 6.509e+01 5.532 3.76e-08 ***
## YearRemodAdd 1.795e+02 6.811e+01 2.636 0.008491 **
## MasVnrArea 2.965e+01 5.612e+00 5.284 1.46e-07 ***
## BsmtFinSF1 1.595e+01 3.947e+00 4.042 5.59e-05 ***
## BsmtUnfSF 6.614e+00 3.666e+00 1.804 0.071366 .
## X1stFlrSF 5.051e+01 5.406e+00 9.343 < 2e-16 ***
## X2ndFlrSF 4.858e+01 4.949e+00 9.816 < 2e-16 ***
## LowQualFinSF 3.348e+01 1.977e+01 1.693 0.090579 .
## BsmtFullBath 9.904e+03 2.537e+03 3.904 9.92e-05 ***
## BsmtHalfBath 2.263e+03 4.057e+03 0.558 0.577115
## FullBath 4.159e+03 2.815e+03 1.478 0.139723
## HalfBath -1.690e+03 2.657e+03 -0.636 0.524822
## BedroomAbvGr -1.033e+04 1.694e+03 -6.099 1.37e-09 ***
## KitchenAbvGr -1.357e+04 5.177e+03 -2.621 0.008860 **
## TotRmsAbvGrd 4.993e+03 1.231e+03 4.056 5.26e-05 ***
## Fireplaces 3.633e+03 1.775e+03 2.047 0.040817 *
## GarageYrBlt -9.660e+01 7.259e+01 -1.331 0.183526
## GarageCars 1.078e+04 2.846e+03 3.789 0.000158 ***
## GarageArea 2.885e+00 1.004e+01 0.287 0.773830
## WoodDeckSF 2.710e+01 7.935e+00 3.415 0.000655 ***
## OpenPorchSF -3.472e-01 1.508e+01 -0.023 0.981639
## ScreenPorch 5.424e+01 1.693e+01 3.203 0.001390 **
## PoolArea -3.098e+01 2.354e+01 -1.316 0.188385
## MoSold -1.177e+02 3.437e+02 -0.343 0.732001
## YrSold -7.229e+02 7.016e+02 -1.030 0.302995
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34700 on 1429 degrees of freedom
## Multiple R-squared: 0.8131, Adjusted R-squared: 0.8092
## F-statistic: 207.2 on 30 and 1429 DF, p-value: < 2.2e-16
Since the predictive power of the model did not drop significantly after removing those variables, I am going to remove the rest of the non-significant predictors and see where the model stands after that.
train_drop_2 <- train_drop_1[, !(names(train_drop_1) %in% c("BsmtHalfBath","FullBath","HalfBath","GarageYrBlt","GarageArea","OpenPorchSF","PoolArea","MoSold","YrSold"))]
model_4 <- lm(SalePrice~.,train_drop_2)
summary(model_4)
##
## Call:
## lm(formula = SalePrice ~ ., data = train_drop_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -488591 -16531 -2181 13718 287300
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.050e+06 1.162e+05 -9.032 < 2e-16 ***
## MSSubClass -1.707e+02 2.607e+01 -6.548 8.08e-11 ***
## LotFrontage -3.748e+01 1.740e+01 -2.154 0.031373 *
## LotArea 4.467e-01 1.005e-01 4.445 9.45e-06 ***
## OverallQual 1.755e+04 1.169e+03 15.011 < 2e-16 ***
## OverallCond 4.392e+03 1.007e+03 4.362 1.38e-05 ***
## YearBuilt 3.331e+02 5.268e+01 6.324 3.39e-10 ***
## YearRemodAdd 1.704e+02 6.481e+01 2.630 0.008637 **
## MasVnrArea 2.966e+01 5.567e+00 5.327 1.16e-07 ***
## BsmtFinSF1 1.575e+01 3.898e+00 4.039 5.64e-05 ***
## BsmtUnfSF 6.373e+00 3.632e+00 1.754 0.079575 .
## X1stFlrSF 5.242e+01 5.111e+00 10.256 < 2e-16 ***
## X2ndFlrSF 4.837e+01 4.120e+00 11.741 < 2e-16 ***
## LowQualFinSF 3.112e+01 1.958e+01 1.589 0.112328
## BsmtFullBath 8.980e+03 2.396e+03 3.748 0.000185 ***
## BedroomAbvGr -9.727e+03 1.659e+03 -5.863 5.64e-09 ***
## KitchenAbvGr -1.215e+04 5.082e+03 -2.392 0.016899 *
## TotRmsAbvGrd 5.052e+03 1.223e+03 4.129 3.85e-05 ***
## Fireplaces 3.774e+03 1.745e+03 2.163 0.030741 *
## GarageCars 1.084e+04 1.689e+03 6.415 1.91e-10 ***
## WoodDeckSF 2.577e+01 7.864e+00 3.277 0.001075 **
## ScreenPorch 5.173e+01 1.685e+01 3.071 0.002177 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34690 on 1438 degrees of freedom
## Multiple R-squared: 0.812, Adjusted R-squared: 0.8093
## F-statistic: 295.8 on 21 and 1438 DF, p-value: < 2.2e-16
Lastly, I remove the last two variables that are not statistically significant:
train_drop_3 <- train_drop_2[, !(names(train_drop_2) %in% c("BsmtUnfSF","LowQualFinSF"))]
model_5 <- lm(SalePrice~.,train_drop_3)
summary(model_5)
##
## Call:
## lm(formula = SalePrice ~ ., data = train_drop_3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -479913 -16324 -2094 14005 287949
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.046e+06 1.155e+05 -9.058 < 2e-16 ***
## MSSubClass -1.709e+02 2.590e+01 -6.597 5.86e-11 ***
## LotFrontage -3.839e+01 1.737e+01 -2.211 0.027221 *
## LotArea 4.478e-01 1.006e-01 4.451 9.19e-06 ***
## OverallQual 1.813e+04 1.137e+03 15.954 < 2e-16 ***
## OverallCond 4.080e+03 9.996e+02 4.082 4.72e-05 ***
## YearBuilt 3.207e+02 5.194e+01 6.174 8.64e-10 ***
## YearRemodAdd 1.809e+02 6.474e+01 2.795 0.005266 **
## MasVnrArea 3.003e+01 5.556e+00 5.405 7.59e-08 ***
## BsmtFinSF1 1.130e+01 2.972e+00 3.804 0.000148 ***
## X1stFlrSF 5.591e+01 4.621e+00 12.099 < 2e-16 ***
## X2ndFlrSF 4.734e+01 4.100e+00 11.548 < 2e-16 ***
## BsmtFullBath 8.330e+03 2.370e+03 3.515 0.000454 ***
## BedroomAbvGr -9.582e+03 1.659e+03 -5.775 9.44e-09 ***
## KitchenAbvGr -1.318e+04 5.058e+03 -2.605 0.009270 **
## TotRmsAbvGrd 5.304e+03 1.210e+03 4.383 1.26e-05 ***
## Fireplaces 3.433e+03 1.741e+03 1.972 0.048850 *
## GarageCars 1.070e+04 1.689e+03 6.334 3.19e-10 ***
## WoodDeckSF 2.541e+01 7.870e+00 3.229 0.001272 **
## ScreenPorch 5.156e+01 1.686e+01 3.058 0.002268 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34740 on 1440 degrees of freedom
## Multiple R-squared: 0.8113, Adjusted R-squared: 0.8088
## F-statistic: 325.8 on 19 and 1440 DF, p-value: < 2.2e-16
After removing those last non-significant variables, the multiple R-squared and adjusted R-squared barely budged.
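As a quick check that trimming predictors did not hurt the fit, the candidate models can be compared on AIC (lower is better); similar values would confirm the simpler model is competitive:
# AIC comparison across the nested models
AIC(model_2, model_3, model_4, model_5)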
par(mfrow=c(2,2))
plot(model_5)
Lastly, let's predict the test set with this model.
#Ensure the test df has the variables used in the final linear regression from above
keep_cols <- c("Id","MSSubClass","LotFrontage","LotArea","OverallQual","OverallCond",
"YearBuilt","YearRemodAdd","MasVnrArea","BsmtFinSF1","X1stFlrSF",
"X2ndFlrSF","BsmtFullBath","BedroomAbvGr","KitchenAbvGr","TotRmsAbvGrd",
"Fireplaces","GarageCars","WoodDeckSF","ScreenPorch")
test_pred <- test_df[, (names(test_df) %in% keep_cols)]
predictions <- predict(model_5, newdata = test_pred)
test_pred$SalePrice<-predictions
#Pull out only Id and Sales price from the prediction df
test_pred <- test_pred %>%
dplyr::select(Id,SalePrice)
# row.names = FALSE keeps the two-column Id, SalePrice format expected by Kaggle
write.csv(test_pred, "C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2024\\Fundamentals of Computational Mathematics DATA 605\\FINAL_PROJECT\\saleprice_Jburns.csv", row.names = FALSE)
Kaggle Username: jonathanburns22 | Score:0.22866
Thank you for a great semester Dr. Larry, see you in DATA 604 this summer!