#load libraries and set seed
library(dplyr)
library(ggplot2)
library(caret)
library(MASS)
library(RANN)
set.seed(1989)


#read training and testing data
train_df <- read.csv("https://raw.githubusercontent.com/mollysiebecker/DATA-605-Final/main/train.csv")
test_df <- read.csv("https://raw.githubusercontent.com/mollysiebecker/DATA-605-Final/main/test.csv")

Pick one of the quantitative independent variables from the training data set (train.csv), and define that variable as X. Make sure this variable is skewed to the right! Pick the dependent variable and define it as Y.

summary(train_df$LotArea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1300    7554    9478   10517   11602  215245
X <- train_df$LotArea
Y <- train_df$SalePrice

I will use Lot Area as the independent variable and Sale Price as the dependent variable.

Probability

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 3rd quartile of the X variable, and the small letter “y” is estimated as the 2nd quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.

a.) P(X>x | Y>y)

This represents the probability that lot area is above the third quartile, given that the sale price is above the median. The code below demonstrates that this probability is about 0.38.

#define x and y
x <- quantile(X, 0.75)
y <- quantile(Y, 0.5)

#filter for observations in which Y>y, and both X>x and Y>y
count_y_above_med <- train_df %>% filter(SalePrice > y) %>% nrow()
count_x_and_y_above <- train_df %>% filter(SalePrice > y) %>% filter(LotArea > x) %>% nrow()

#calculate P(X>x | Y>y)
p_a <- count_x_and_y_above/count_y_above_med
print(p_a)
## [1] 0.3791209
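
As a quick cross-check (an equivalent sketch using only base R logical indexing, nothing beyond the x and y defined above), the same conditional probability can be computed in one line:

#equivalent calculation of P(X>x | Y>y): proportion of LotArea above x among rows with SalePrice above y
mean(train_df$LotArea[train_df$SalePrice > y] > x)  # matches p_a above, ~0.379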

b.) P(X>x, Y>y)

This represents the probability that the lot area is above the third quartile, and the sale price is above the median. The code below demonstrates that this probability is about 0.19.

#count total number of observations
count_total <- nrow(train_df)

#calculate P(X>x, Y>y)
p_b <- count_x_and_y_above/count_total
print(p_b)
## [1] 0.1890411

c.) P(X<x | Y>y)

This represents the probability that the lot area is below the third quartile, given that the sale price is above the median. The code below demonstrates that this probability is about 0.62.

#filter for observations in which X<x and Y>y
count_x_below_y_above <- train_df %>% filter(SalePrice > y) %>% filter(LotArea < x) %>% nrow()

#calculate P(X<x | Y>y) 
p_c <- count_x_below_y_above/count_y_above_med
print(p_c)
## [1] 0.6208791

Two Way Table

count_x_and_y_leq <- as.numeric(nrow(train_df %>% 
      filter(SalePrice <= y) %>% 
      filter(LotArea <= x)))
count_x_leq_y_above <- as.numeric(nrow(train_df %>% 
      filter(SalePrice > y) %>% 
      filter(LotArea <= x)))
count_x_above_y_leq <- as.numeric(nrow(train_df %>% 
      filter(SalePrice <= y) %>% 
      filter(LotArea > x)))
twt_A <- c(count_x_and_y_leq, count_x_above_y_leq, sum(count_x_and_y_leq, count_x_above_y_leq))
twt_B <- c(count_x_leq_y_above, count_x_and_y_above, sum(count_x_leq_y_above, count_x_and_y_above))
twt_C <- c(sum(count_x_and_y_leq, count_x_leq_y_above), sum(count_x_above_y_leq, count_x_and_y_above), count_total)

two_way_table <- data.frame(twt_A, twt_B, twt_C)
colnames(two_way_table) <- c("Y<= 2nd quartile", "Y>2nd quartile", "Total")
rownames(two_way_table) <- c("X<= 3rd quartile", "X>3rd quartile", "Total")
print(two_way_table)
##                  Y<= 2nd quartile Y>2nd quartile Total
## X<= 3rd quartile              643            452  1095
## X>3rd quartile                 89            276   365
## Total                         732            728  1460
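
As an aside (a small alternative sketch, not part of the required output), the same table of counts can be built more compactly with base R's table() and addmargins():

#alternative construction of the two-way table with row and column totals
counts <- table(LotArea = train_df$LotArea > x, SalePrice = train_df$SalePrice > y)
addmargins(counts)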

Checking for Association

Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 3rd quartile for X, and let B be the new variable counting those observations above the 2nd quartile for Y. Does P(A|B)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.

Calculating marginal and joint probabilities:

#define A and B
A <- train_df %>% filter(LotArea > x) %>% nrow()
B <- train_df %>% filter(SalePrice > y) %>% nrow()
  
#calculating marginal probabilities
p_A <- A/count_total
p_B <- B/count_total


#calculate joint probability assuming independence
p_A*p_B
## [1] 0.1246575
#calculate conditional probability P(A|B)
count_x_and_y_above/count_y_above_med
## [1] 0.3791209

Splitting the training data in this fashion does not make the variables independent. Under independence we would have \(P(A\cap B)=P(A)P(B)\), or equivalently \(P(A|B)=P(A)\); here \(P(A|B)\approx 0.38\) while \(P(A)\approx 0.25\), and the joint probability \(\approx 0.19\) exceeds \(P(A)P(B)\approx 0.12\). In other words, when the sale price is above the median, the lot area is more likely to be above its third quartile. This association suggests it is worth building a model to explore the relationship between these variables further.
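
To make the mathematical check explicit, a small sketch using only the quantities computed above: independence would require the joint probability to equal the product of the marginals, or equivalently P(A|B) = P(A).

#joint probability P(A and B) vs. product of marginals P(A)*P(B)
p_b        # observed joint probability, ~0.189
p_A * p_B  # expected under independence, ~0.125

#conditional probability P(A|B) vs. marginal P(A)
p_a        # ~0.379
p_A        # ~0.25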

Chi Square Test

We can confirm this using a Chi Square Test to look for an association between the variables, with the null hypothesis that there is no association.

chisq_df <- two_way_table[1:2, 1:2]
chisq.test(chisq_df)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  chisq_df
## X-squared = 127.74, df = 1, p-value < 2.2e-16

The Chi-squared test confirms the association: the p-value is far below any conventional significance level (p < 2.2e-16), so we reject the null hypothesis that lot area and sale price are not associated.

Descriptive and Inferential Statistics

Provide univariate descriptive statistics and appropriate plots for the training data set.

summary(train_df$LotArea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1300    7554    9478   10517   11602  215245
summary(train_df$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
ggplot() +
  geom_histogram(data = train_df, aes(x = LotArea), fill = "blue", alpha = 0.8, bins = 100) +
  labs(title = "Distribution of Lot Area",
       x = "Lot Area (square feet)",
       y = "Count") +
  theme_minimal()

ggplot() +
  geom_histogram(data = train_df, aes(x = SalePrice), fill = "red", alpha = 0.8, bins = 100) +
  labs(title = "Distribution of Sale Price",
       x = "Sale Price (dollars)",
       y = "Count") +
  theme_minimal()

Provide a scatterplot of X and Y.

ggplot(train_df, aes(x=LotArea, y=SalePrice)) + geom_point()

Difference in Means

Provide a 95% CI for the difference in the mean of the variables.

#filter data frames for Lot Area
above_3q_lot_area_df <- train_df %>% filter(LotArea>x)
below_3q_lot_area_df <- train_df %>% filter(LotArea<=x)

#run t test on Sale Prices
t_test_result <- t.test(above_3q_lot_area_df$SalePrice, below_3q_lot_area_df$SalePrice)

#extract confidence interval
diff_means_ci <- t_test_result$conf.int
print(diff_means_ci)
## [1] 61945.67 84268.88
## attr(,"conf.level")
## [1] 0.95

We can be 95% confident that the true difference in mean sale price between homes with lot areas in the top quartile and homes with lot areas in the bottom three quartiles lies in the interval (61945.67, 84268.88). This is further evidence that homes with larger lots tend to sell for higher prices.
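
For reference, a small sketch pulling the group means behind this interval from the same t.test object:

#group means and the point estimate of the difference
t_test_result$estimate                                 # mean sale price in each lot-area group
t_test_result$estimate[1] - t_test_result$estimate[2]  # the midpoint of the interval above, ~73107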

Correlation Matrix

Derive a correlation matrix for two of the quantitative variables you selected.

I included two predictor variables along with the target variable so that I could compare each predictor’s correlation with the target and also inspect the correlation between the two predictors.

selected_variables <- train_df[, c("LotArea", "GrLivArea", "SalePrice")]
correlation_matrix <- cor(selected_variables)

print(correlation_matrix)
##             LotArea GrLivArea SalePrice
## LotArea   1.0000000 0.2631162 0.2638434
## GrLivArea 0.2631162 1.0000000 0.7086245
## SalePrice 0.2638434 0.7086245 1.0000000

We have 1’s along the diagonal since each variable is perfectly correlated with itself. The living area has a moderately strong positive correlation with the sale price, while the lot area has only a weak positive correlation with the sale price. Interestingly, the lot area has almost the same correlation with the living area as with the sale price.

Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval. Discuss the meaning of your analysis.

cor.test(selected_variables$LotArea, selected_variables$SalePrice, conf.level=0.99)
## 
##  Pearson's product-moment correlation
## 
## data:  selected_variables$LotArea and selected_variables$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
##  0.2000196 0.3254375
## sample estimates:
##       cor 
## 0.2638434
cor.test(selected_variables$LotArea, selected_variables$GrLivArea, conf.level=0.99)
## 
##  Pearson's product-moment correlation
## 
## data:  selected_variables$LotArea and selected_variables$GrLivArea
## t = 10.414, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
##  0.1992693 0.3247386
## sample estimates:
##       cor 
## 0.2631162
cor.test(selected_variables$GrLivArea, selected_variables$SalePrice, conf.level=0.99)
## 
##  Pearson's product-moment correlation
## 
## data:  selected_variables$GrLivArea and selected_variables$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
##  0.6733974 0.7406408
## sample estimates:
##       cor 
## 0.7086245

For each pair of variables, we reject the null hypothesis that the correlation is 0, since none of the 99% confidence intervals include 0. This confirms the weak positive correlation between lot area and sale price, and between lot area and living area. We can be 99% confident that the true correlation between lot area and sale price is between 0.2000196 and 0.3254375, and that the true correlation between living area and sale price is between 0.6733974 and 0.7406408. Interestingly, the pair with the strongest correlation also has the narrowest confidence interval.

In the context of our data set, this means that greater living area is a reliable predictor of a higher sale price, while greater lot area is a much weaker one.

Linear Algebra and Correlation

Invert your correlation matrix. (This is known as the precision matrix and contains variance inflation factors on the diagonal.)

precision_matrix <- solve(correlation_matrix)
print(precision_matrix)
##              LotArea  GrLivArea  SalePrice
## LotArea    1.0884485 -0.1664868 -0.1692033
## GrLivArea -0.1664868  2.0340972 -1.3974846
## SalePrice -0.1692033 -1.3974846  2.0349350
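
As a hedged check of the variance-inflation-factor claim above: each diagonal entry of the precision matrix should equal 1/(1 - R^2), where R^2 comes from regressing that variable on the other two. A small sketch for SalePrice:

#verify the SalePrice diagonal entry equals its variance inflation factor
r_sq <- summary(lm(SalePrice ~ LotArea + GrLivArea, data = selected_variables))$r.squared
1/(1 - r_sq)  # should match precision_matrix["SalePrice", "SalePrice"], ~2.035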

Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.

c_by_p_matrix <- round(correlation_matrix %*% precision_matrix,2)
print(c_by_p_matrix)
##           LotArea GrLivArea SalePrice
## LotArea         1         0         0
## GrLivArea       0         1         0
## SalePrice       0         0         1
p_by_c_matrix <- round(precision_matrix %*% correlation_matrix,2)
print(p_by_c_matrix)
##           LotArea GrLivArea SalePrice
## LotArea         1         0         0
## GrLivArea       0         1         0
## SalePrice       0         0         1

When rounding the values, both products are identity matrices since the original matrices are inverses of one another.
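
Without rounding, the products equal the identity matrix only up to floating-point error, which can be verified directly (a small sketch using base R's all.equal()):

#confirm the unrounded product is the identity up to numerical precision
all.equal(unname(correlation_matrix %*% precision_matrix), diag(3))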

PCA

Conduct principal components analysis (research this!) and interpret.

mypca=princomp(x = selected_variables, cor = T)
lambda=mypca$sdev[1]^2
print(lambda)
##   Comp.1 
## 1.868492

Lambda is the variance of the first principal component (its largest eigenvalue). Because the PCA was run on the correlation matrix, the total variance of the three standardized variables is 3, so a value of about 1.87 means the first component accounts for roughly 62% of that variance.
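
A short sketch to make that explicit: with cor = T the total variance equals the number of variables (here 3), so the first eigenvalue can be expressed as a proportion of the total.

#proportion of total variance explained by the first principal component
lambda / ncol(selected_variables)  # ~1.87/3, roughly 62%
summary(mypca)                     # proportions for all three components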

I also performed principal component analysis using all the quantitative variables. Before doing this, I imputed the training data for later model creation so there would be no missing values.

Imputing Data

For many of the categorical variables, “NA” is not a missing value but an indication that the feature in question, such as a garage or basement, is not present in the house (according to the data set description). I replaced these NAs with strings indicating there is no basement, no garage, etc. For categorical variables with genuinely missing data, I replaced the NA with the most common value of that variable.

train_df$Alley[is.na(train_df$Alley)] <- "none"
train_df$BsmtQual[is.na(train_df$BsmtQual)] <- "NB"
train_df$BsmtCond[is.na(train_df$BsmtCond)] <- "NB"
train_df$BsmtExposure[is.na(train_df$BsmtExposure)] <- "NB"
train_df$BsmtFinType1[is.na(train_df$BsmtFinType1)] <- "NB"
train_df$BsmtFinType2[is.na(train_df$BsmtFinType2)] <- "NB"
train_df$FireplaceQu[is.na(train_df$FireplaceQu)] <- "NF"
train_df$GarageType[is.na(train_df$GarageType)] <- "NG"
train_df$GarageFinish[is.na(train_df$GarageFinish)] <- "NG"
train_df$GarageQual[is.na(train_df$GarageQual)] <- "NG"
train_df$GarageCond[is.na(train_df$GarageCond)] <- "NG"
train_df$PoolQC[is.na(train_df$PoolQC)] <- "NP"
train_df$Fence[is.na(train_df$Fence)] <- "NF"
train_df$MiscFeature[is.na(train_df$MiscFeature)] <- "none"
train_df$MasVnrType[is.na(train_df$MasVnrType)] <- "None"
train_df$Electrical[is.na(train_df$Electrical)] <- "SBrkr"

Then, I imputed missing quantitative values with the median of that variable using the caret package and “medianImpute.”

# Pre-process data
preProcess_median <- preProcess(train_df, method = "medianImpute")

# Impute the missing values
data_imputed <- predict(preProcess_median, train_df)

The training data set now has no missing values:

any(is.na(data_imputed))
## [1] FALSE
# principal component analysis, all quantitative variables
quantitative_imputed_data <- select_if(data_imputed, is.numeric)
mypca2=princomp(x = quantitative_imputed_data, cor = T)
lambda2=mypca2$sdev[1]^2
print(lambda2)
##   Comp.1 
## 7.895225

When all 38 quantitative variables are included, the first eigenvalue of ~7.9 is larger than the earlier ~1.87 simply because there is more total variance to capture. As a proportion, however, the first component now explains about 21% of the total variance (7.9/38), compared with about 62% (1.87/3) in the three-variable analysis.

We can see that the first 18 components account for 80% of the variation.

mycomponents=mypca2$sdev^2/sum(mypca2$sdev^2)
sum(mycomponents[1:18])
## [1] 0.805013
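
The same cumulative calculation can be written as a small sketch that returns the first component count reaching any target, e.g. 80% (using the mycomponents vector defined above):

#cumulative proportion of variance and the first component count reaching 80%
cum_var <- cumsum(mycomponents)
which(cum_var >= 0.80)[1]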

We can examine the loadings of each component to see which variables contribute and which are most influential (those with the greatest absolute loadings).

loadings_all_components <- mypca2$loadings

The screeplot below shows that after the first 6 principal components, the additional amount of variance explained tapers off significantly.

screeplot(mypca2, type = "lines")

We can also view one component at a time to see the impact of each variable more clearly, as with the sorted loadings of the first principal component below. Overall quality, above-ground living area, and the garage variables (garage area and number of cars) carry the largest loadings in the first principal component, along with the sale price itself.

loadings_comp1 <- mypca2$loadings[, 1]

sorted_loadings_comp1 <- sort(loadings_comp1, decreasing = TRUE)

print(sorted_loadings_comp1)
##   OverallCond EnclosedPorch  KitchenAbvGr    MSSubClass        YrSold 
##   0.073516386   0.068852443   0.021280085   0.017918622   0.012492853 
##  BsmtHalfBath    BsmtFinSF2  LowQualFinSF       MiscVal            Id 
##   0.011540503   0.010573653   0.010346325   0.010220794   0.002993571 
##    X3SsnPorch        MoSold   ScreenPorch      PoolArea  BsmtFullBath 
##  -0.016163506  -0.020528394  -0.031251154  -0.046613664  -0.074132796 
##  BedroomAbvGr       LotArea     BsmtUnfSF      HalfBath     X2ndFlrSF 
##  -0.102403251  -0.102548433  -0.107687474  -0.119029333  -0.126700691 
##    WoodDeckSF    BsmtFinSF1   OpenPorchSF   LotFrontage    Fireplaces 
##  -0.127797843  -0.136997461  -0.140467675  -0.151315946  -0.176468946 
##    MasVnrArea  YearRemodAdd   GarageYrBlt     YearBuilt  TotRmsAbvGrd 
##  -0.186008612  -0.199228368  -0.206154205  -0.224234623  -0.226664538 
##     X1stFlrSF      FullBath   TotalBsmtSF    GarageArea    GarageCars 
##  -0.244870403  -0.245962690  -0.247004256  -0.263893733  -0.268064137 
##     GrLivArea   OverallQual     SalePrice 
##  -0.284443119  -0.290450262  -0.317419502

Calculus-Based Probability & Statistics

Many times, it makes sense to fit a closed form distribution to data. For your variable that is skewed to the right, shift it so that the minimum value is above zero. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).

Fit exponential distribution

The minimum lot area is 1300 square feet, so the variable is already strictly positive and no shift is needed before fitting.
exp_fit <-fitdistr(train_df$LotArea, densfun = "exponential")
print(exp_fit)
##        rate    
##   9.508570e-05 
##  (2.488507e-06)

The fitted exponential distribution has rate lambda approximately 0.000095, corresponding to a mean of 1/lambda, about 10517, which is the sample mean of the lot area.
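
As a quick sanity check (a small sketch using only base R), the maximum-likelihood estimate of an exponential rate is simply one over the sample mean:

#MLE of the exponential rate is 1 / sample mean
1 / mean(train_df$LotArea)  # ~9.51e-05, matching fitdistr()'s estimate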

Take samples

Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ))

lambda_exp <- exp_fit$estimate
print(lambda_exp)
##        rate 
## 9.50857e-05
exp_samples <- rexp(1000, lambda_exp)

Compare Histograms

Plot a histogram and compare it with a histogram of your original variable.

ggplot() +
  geom_histogram(data = train_df, aes(x = LotArea), fill = "blue", alpha = 0.5, bins = 100) +
  geom_histogram(aes(x = exp_samples), fill = "red", alpha = 0.5, bins = 100) +
  labs(title = "Comparing Distribution of Lot Area and Fitted Exponential Distribution",
       x = "Value",
       y = "Count") +
  theme_minimal()

summary(train_df$LotArea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1300    7554    9478   10517   11602  215245
summary(exp_samples)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     7.22  3024.59  6756.66 10045.36 13977.68 86270.16

The original lot-area distribution is less spread out around its center than the fitted exponential: the blue bars are taller near the mean and the red bars are taller farther out in the tails. The narrower spread is also reflected in the smaller IQR of the original data (about 4050 vs. about 10950 for the exponential samples). In addition, the fitted samples have both a lower minimum and a lower maximum than the observed data.

Compare Percentiles

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

percentiles <- qexp(c(0.05, 0.95), rate = lambda_exp)
print(percentiles)
## [1]   539.4428 31505.6013
conf_interval <- t.test(train_df$LotArea)$conf.int
print(conf_interval)
## [1] 10004.42 11029.24
## attr(,"conf.level")
## [1] 0.95
empirical_percentiles <- quantile(train_df$LotArea, c(0.05, 0.95))
print(empirical_percentiles)
##       5%      95% 
##  3311.70 17401.15

As seen in the overlaid histograms, the 5th percentile of the fitted exponential distribution (539.44) is far below that of the empirical data (3311.70). Conversely, even though the empirical maximum exceeds the maximum of the fitted samples, the empirical 95th percentile (17401.15) is well below the exponential’s 95th percentile (31505.60). This reflects the heavy right skew and the handful of extreme outliers in the original data, as well as the narrower spread of its bulk noted in the histogram comparison, and highlights how imperfectly the exponential matches the original distribution.

Examining the 95% confidence interval for the mean of the original distribution, (10004.42, 11029.24), we see that the mean of the fitted samples (10045.36) falls within it, one respect in which the fitted distribution approximates the original well.

Modeling

Build some type of regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

Train Model

The code for creating the model is below. I used the “glmnet” method, with alpha = 0 (ridge regression), to limit overfitting and account for multicollinearity among the many predictors, and then removed variables that appeared to contribute little to the model (this is discussed further in the conclusion).

model <- train(SalePrice ~ . -Id -Alley -LotShape -LotFrontage -LandContour -HouseStyle -FireplaceQu -OpenPorchSF -EnclosedPorch -GarageFinish -GarageYrBlt -BsmtHalfBath -ExterCond -CentralAir -PavedDrive -Electrical -Heating -MiscFeature -MiscVal -YrSold, 
                  data = data_imputed, 
                  method = "glmnet",
                  trControl = trainControl(method = "cv"),
                  tuneLength = 10,
                  preProc = c("center", "scale"),
                  tuneGrid = expand.grid(alpha = 0, lambda = seq(0.001, 1, length = 10)))
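
A brief sketch of how the tuned model can be inspected with caret's standard accessors (these lines are illustrative and were not part of the submitted output):

#inspect the cross-validated tuning results and the chosen penalty
model$bestTune           # selected alpha and lambda
min(model$results$RMSE)  # best cross-validated RMSE
#ridge coefficients at the selected lambda
coef(model$finalModel, s = model$bestTune$lambda)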

Make Predictions

Before making predictions, I imputed missing data in the testing data frame using the same process as for the training data frame.

#impute test df
test_df$Alley[is.na(test_df$Alley)] <- "none"
test_df$BsmtQual[is.na(test_df$BsmtQual)] <- "NB"
test_df$BsmtCond[is.na(test_df$BsmtCond)] <- "NB"
test_df$BsmtExposure[is.na(test_df$BsmtExposure)] <- "NB"
test_df$BsmtFinType1[is.na(test_df$BsmtFinType1)] <- "NB"
test_df$BsmtFinType2[is.na(test_df$BsmtFinType2)] <- "NB"
test_df$FireplaceQu[is.na(test_df$FireplaceQu)] <- "NF"
test_df$GarageType[is.na(test_df$GarageType)] <- "NG"
test_df$GarageFinish[is.na(test_df$GarageFinish)] <- "NG"
test_df$GarageQual[is.na(test_df$GarageQual)] <- "NG"
test_df$GarageCond[is.na(test_df$GarageCond)] <- "NG"
test_df$PoolQC[is.na(test_df$PoolQC)] <- "NP"
test_df$Fence[is.na(test_df$Fence)] <- "NF"
test_df$MiscFeature[is.na(test_df$MiscFeature)] <- "none"
test_df$MasVnrType[is.na(test_df$MasVnrType)] <- "None"
test_df$Electrical[is.na(test_df$Electrical)] <- "SBrkr"
test_df$Exterior1st[is.na(test_df$Exterior1st)] <- "VinylSd"
test_df$Exterior2nd[is.na(test_df$Exterior2nd)] <- "VinylSd"
test_df$Utilities[is.na(test_df$Utilities)] <- "AllPub"
test_df$MSZoning[is.na(test_df$MSZoning)] <- "RL"
test_df$KitchenQual[is.na(test_df$KitchenQual)] <- "TA"
test_df$SaleType[is.na(test_df$SaleType)] <- "WD"
test_df$Functional[is.na(test_df$Functional)] <- "Typ"
# Apply medianImpute
preProcess_median <- preProcess(test_df, method = "medianImpute")

# Impute the missing values
test_df <- predict(preProcess_median, test_df)

The testing data set now has no missing values:

any(is.na(test_df))
## [1] FALSE
#make predictions
predictions <- predict(model, newdata = test_df)

#create data frame for submission to Kaggle
submission <- data.frame(Id = test_df$Id, SalePrice = predictions)

#write data frame to csv
write.csv(submission, "predictions_1.csv", row.names = FALSE)

Conclusion

I submitted six different sets of predictions from various models to Kaggle, but the closest predictions came from my first model shown above. (My Kaggle username is mollysiebecker, and my best score was 0.15318.) In subsequent models (shown below), I tried different combinations of backwards elimination: removing variables with the highest p-values, maximizing the adjusted R^2 (as in the second model, which had an adjusted R^2 of 0.9208), or removing the quantitative variables that accounted for the least variation in the principal component analysis. Most of the models had a Kaggle score between 0.15318 and 0.15765, except for the second submission, which scored 0.18385. That was the only model fit with the lm() function from base R rather than the “glmnet” method from the caret package. I was surprised that the model with an extremely high adjusted R^2 produced the least accurate predictions, but this is likely due to overfitting, and it is fairly good evidence that the regularized “glmnet” approach is better at handling regression with a large number of variables and guarding against overfitting.

#other less accurate models

model2 <- lm(SalePrice ~ . -Id -MiscVal -OpenPorchSF -EnclosedPorch -MiscFeature -PavedDrive -GarageYrBlt -TotalBsmtSF -GrLivArea -LotFrontage -Alley -LotShape -HouseStyle -Exterior2nd -ExterCond -Heating -Electrical -BsmtHalfBath -GarageType -YrSold -HalfBath -LowQualFinSF -CentralAir -BsmtFinType2, data = data_imputed)
options(max.print = 10000)


model3 <- train(SalePrice ~ . -Id -MiscVal -OpenPorchSF -EnclosedPorch -MiscFeature -PavedDrive -GarageYrBlt -TotalBsmtSF -GrLivArea -LotFrontage -Alley -LotShape -HouseStyle -Exterior2nd -ExterCond -Heating -Electrical -BsmtHalfBath -GarageType -YrSold -HalfBath -LowQualFinSF -CentralAir -BsmtFinType2, 
                  data = data_imputed, 
                  method = "glmnet",
                  trControl = trainControl(method = "cv"),
                  tuneLength = 10,
                  preProc = c("center", "scale"),
                  tuneGrid = expand.grid(alpha = 0, lambda = seq(0.001, 1, length = 10)))


model4 <- train(SalePrice ~ . -Id -Alley -LotShape -LandContour -HouseStyle -FireplaceQu -GarageFinish  -BsmtHalfBath -ExterCond -CentralAir -PavedDrive -Electrical -Heating -MiscFeature -MiscVal -YrSold -X3SsnPorch -MoSold,
                  data = data_imputed, 
                  method = "glmnet",
                  trControl = trainControl(method = "cv"),
                  tuneLength = 10,
                  preProc = c("center", "scale"),
                  tuneGrid = expand.grid(alpha = 0, lambda = seq(0.001, 1, length = 10)))


model5 <- train(SalePrice ~ . -Id -Alley -LotShape -LandContour -HouseStyle -FireplaceQu -GarageFinish  -BsmtHalfBath -ExterCond -CentralAir -PavedDrive -Electrical -Heating -MiscFeature -MiscVal -YrSold,
                  data = data_imputed, 
                  method = "glmnet",
                  trControl = trainControl(method = "cv"),
                  tuneLength = 10,
                  preProc = c("center", "scale"),
                  tuneGrid = expand.grid(alpha = 0, lambda = seq(0.001, 1, length = 10)))


model6 <- train(SalePrice ~ . -Id -Alley -LotShape -LandContour -HouseStyle -FireplaceQu -GarageFinish -ExterCond -CentralAir -PavedDrive -Electrical -Heating -MiscFeature -MiscVal -X3SsnPorch,
                  data = data_imputed, 
                  method = "glmnet",
                  trControl = trainControl(method = "cv"),
                  tuneLength = 10,
                  preProc = c("center", "scale"),
                  tuneGrid = expand.grid(alpha = 0, lambda = seq(0.001, 1, length = 10)))