Load all the necessary Libraries
library(gridExtra)
library(RColorBrewer)
library(Matrix)
library(scales)
library(corrplot)
library(MASS)
library(psych)
library(ggplot2)
library(DataExplorer)
library(ggpmisc)
This R Markdown document is the final exam for Data 605.
Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean of:
\[\mu=\sigma=(N+1)/2\]
set.seed(123)
N = 10
n = 10000
# Random Uniform number generation
X = runif(n, 1, N)
# Random normal number generation
Y = rnorm(n, mean = (N+1)/2, sd = (N+1)/2)
df = data.frame(cbind(X, Y))
plot_histogram(df, title = "Uniform and Normal Distributions generated")
Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable.
The median of X is calculated as:
## [1] 5.451109
The 1st quartile (25th percentile) of Y is calculated as:
## 25%
## 1.840529
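The code that produced the two estimates above is not shown in the rendered output; a minimal sketch, assuming x and y are computed directly from the simulated vectors:
x = median(X)           # small x: median of the uniform variable X
y = quantile(Y, 0.25)   # small y: 1st quartile of the normal variable Y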
Calculate as a minimum the below probabilities a through c. Interpret the meaning of all probabilities.
# Number of observations where X is greater than y, the 1st quartile of Y
(nXy = nrow(subset(df, X > y)))
## [1] 9075
# a. Probability that X is greater than x, the median of X, given that X is greater than y, the 1st quartile of Y
(pa = nrow(subset(df, X > x & X > y))/nXy)
## [1] 0.5509642
The probability is about 55%: given that X exceeds the 1st quartile of Y, X also exceeds its own median roughly 55% of the time.
# b. Probability that X is greater than x, the median of X, and Y is greater than y, the 1st quartile of Y
(pb = nrow(subset(df, X > x & Y > y))/n)
## [1] 0.3756
The probability is about 37.6%: the joint probability that X exceeds its median and Y exceeds its 1st quartile, close to the 0.5 * 0.75 = 0.375 expected when X and Y are independent.
# c. Probability that X is less than x, the median of X, given that X is greater than y, the 1st quartile of Y
(pc = nrow(subset(df, X < x & X > y))/nXy)
## [1] 0.4490358
The probability is about 44.9%: given that X exceeds the 1st quartile of Y, X falls below its own median about 45% of the time (the complement of probability a within the same condition).
Investigate whether P(X>x and Y>y) = P(X > x) * P(Y > y) by building a table and evaluating the marginal and joint probabilities.
A1 <- c(sum(X <= x & Y <= y), sum(X > x & Y <= y))
B1 <- c(sum(X <= x & Y > y), sum(X > x & Y > y))
matrix1 <- matrix(c(A1, B1), nrow = 2)
matrix1 <- rbind(matrix1, apply(matrix1, 2, sum))
matrix1 <- cbind(matrix1, apply(matrix1, 1, sum))
xy <- c("X <= median(X)", "X > median(X)", "Total")
countDF <- data.frame(xy, matrix1)
colnames(countDF) <- c("x/y", "Y <= 1st quartile", "Y > 1st quartile", "Total")
print(countDF)
##              x/y Y <= 1st quartile Y > 1st quartile Total
## 1 X <= median(X)              1256             3744  5000
## 2  X > median(X)              1244             3756  5000
## 3          Total              2500             7500 10000
A <- countDF[2, 4]
B <- countDF[3, 3]
A_B <- countDF[2, 3]
tot <- countDF[3, 4]
Prob_A <- A/tot
Prob_B <- B/tot
prob_A_B <- A_B/tot
print(prob_A_B)
## [1] 0.3756
So P(X>x and Y>y) = 0.3756.
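The code computing the product of the marginal probabilities is not shown above; presumably something like:
print(Prob_A * Prob_B)  # P(X > x) * P(Y > y) = 0.5 * 0.75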
## [1] 0.375
The joint probability 0.3756 is essentially equal to the product of the marginals, 0.375, so P(X>x and Y>y) = P(X>x)P(Y>y) holds up to sampling variability, consistent with X and Y being independent.
Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test.
What is the difference between the two? Which is most appropriate?
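The contingency table M and the test calls are not shown in the rendered output; a minimal sketch, assuming M is the 2x2 table of counts built above:
M <- matrix(c(sum(X <= x & Y <= y), sum(X > x & Y <= y),
              sum(X <= x & Y > y), sum(X > x & Y > y)), nrow = 2)
chisq.test(M)   # large-sample test (Yates' continuity correction applied by default for 2x2 tables)
fisher.test(M)  # exact test based on the hypergeometric distribution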
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: M
## X-squared = 0.064533, df = 1, p-value = 0.7995
##
## Fisher's Exact Test for Count Data
##
## data: M
## p-value = 0.7995
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9242273 1.1100187
## sample estimates:
## odds ratio
## 1.012883
Both tests return a p-value of about 0.80, so we fail to reject the null hypothesis and conclude that X and Y are independent, as expected by construction. The chi-square test relies on a large-sample approximation, whereas Fisher's Exact Test computes an exact p-value and is usually preferred when expected cell counts are small; with 10,000 observations either test is appropriate here, and the two agree.
You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques .
I want you to do the following.
The variables selected for the regression analysis are LotArea, SalePrice, and TotalBsmtSF.
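The code that reads the training data is not shown above; a minimal sketch, assuming the competition file has been downloaded locally as train.csv:
dftrain = read.csv("train.csv")   # Kaggle training set (file name assumed)
DataExplorer::introduce(dftrain)  # dataset summary like the one shown below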
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 1460 81 43 38 0
## total_missing_values complete_rows total_observations memory_usage
## 1 6965 0 118260 755280
DataExplorer::plot_str(dftrain)
# Select the variables to perform the various statistical operations
X = dftrain$LotArea
Y = dftrain$SalePrice
Z = dftrain$TotalBsmtSF
Provide univariate descriptive statistics and appropriate plots for the training data set.
Descriptive statistics on LotArea
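The summary tables below are consistent with base summary() and psych::describe(); a minimal sketch of how they could be produced:
summary(X)          # five-number summary plus mean
psych::describe(X)  # mean, sd, skew, kurtosis, standard error, etc.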
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1300 7554 9478 10517 11602 215245
## vars n mean sd median trimmed mad min max range skew
## X1 1 1460 10516.83 9981.26 9478.5 9563.28 2962.23 1300 215245 213945 12.18
## kurtosis se
## X1 202.26 261.22
# BoxPlot and Histograms for the LotArea
par(mfrow=c(1,2))
boxplot(X, main="LotArea BoxPlot")
hist(X, breaks = 20, main = "LotArea Histogram")
Descriptive statistics on SalePrice
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
## vars n mean sd median trimmed mad min max range skew
## X1 1 1460 180921.2 79442.5 163000 170783.3 56338.8 34900 755000 720100 1.88
## kurtosis se
## X1 6.5 2079.11
# BoxPlot and Histograms for the SalePrice
par(mfrow=c(1,2))
boxplot(Y, main="SalePrice BoxPlot")
hist(Y, breaks = 30, main = "SalePrice Histogram")
Descriptive statistics on TotalBsmtSF
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 795.8 991.5 1057.4 1298.2 6110.0
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 1460 1057.43 438.71 991.5 1036.7 347.67 0 6110 6110 1.52 13.18
## se
## X1 11.48
# BoxPlot and Histogram for TotalBsmtSF
par(mfrow=c(1,2))
boxplot(Z, main="basementArea BoxPlot")
hist(Z, breaks = 30, main = "basementArea Histogram")
Provide a scatterplot matrix for at least two of the independent variables and the dependent variable.
# Scatterplot of LotArea and Basementsqft vs SalePrice
par(mfrow=c(1,2))
ggplot(data.frame(X,Y,Z), aes( x = X, y = Y))+
geom_point()+theme_bw()+
geom_smooth(method = "lm")+
labs(x = "LotArea",
y = "SalePrice")+
stat_poly_eq(aes(label = paste(..adj.rr.label.., sep = "~~~")),
label.x.npc = "right", label.y.npc = .9,
formula = y~x, parse = TRUE, size = 3.5)
ggplot(data.frame(X,Y,Z), aes( x = Z, y = Y))+
geom_point()+theme_bw()+
geom_smooth(method = "lm")+
labs(x = "basementSqft",
y = "SalePrice")+
stat_poly_eq(aes(label = paste(..adj.rr.label.., sep = "~~~")),
label.x.npc = "left", label.y.npc = .9,
formula = y~x, parse = TRUE, size = 3.5)
From the scatterplots there appears to be a strong correlation between the basement square footage and SalePrice, but only a weak correlation between LotArea and SalePrice; several outliers are also visible in the LotArea plot.
Derive a correlation matrix for any three quantitative variables in the dataset.
dfsub<- data.frame(LotArea = X, SalePrice = Y, BsmntSqFt=Z )
corMatrix = cor(dfsub, use = "complete.obs")
DataExplorer::plot_correlation(dfsub, ggtheme = theme_bw(),cor_args = list("use" = "pairwise.complete.obs"))
From the correlation matrix we can conclude that the correlations among the three variables range from weak to strong: SalePrice has a strong correlation with TotalBsmtSF (0.61) and a weak correlation with LotArea (0.26), while LotArea and TotalBsmtSF are weakly correlated with each other (0.26).
Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval.
# Pairwise correlation between LotArea and SalePrice
cor.test(dfsub$LotArea, dfsub$SalePrice, method = 'pearson', conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: dfsub$LotArea and dfsub$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2323391 0.2947946
## sample estimates:
## cor
## 0.2638434
# Pairwise correlation between TotalBsmtSF and SalePrice
cor.test(dfsub$BsmntSqFt, dfsub$SalePrice, method = 'pearson', conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: dfsub$BsmntSqFt and dfsub$SalePrice
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5922142 0.6340846
## sample estimates:
## cor
## 0.6135806
# Pairwise correlation between LotArea and TotalBsmtSF
cor.test(dfsub$LotArea, dfsub$BsmntSqFt, method = 'pearson', conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: dfsub$LotArea and dfsub$BsmntSqFt
## t = 10.317, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2292786 0.2918400
## sample estimates:
## cor
## 0.2608331
Discuss the meaning of your analysis.
The pairwise correlation tests (Pearson method) estimate the association between each pair of variables and test whether the true correlation is zero. All p-values are well below conventional significance levels (e.g., alpha = 0.05), so we reject the null hypothesis in each case and conclude that each pair of variables is correlated, with the correlation coefficients shown above.
Would you be worried about familywise error? Why or why not?
Yes, I would be worried. The training dataset contains many variables that could be tested pairwise, and each additional hypothesis test increases the familywise probability of rejecting at least one true null hypothesis. Unless a multiple-comparison correction is applied (or the other variables are otherwise accounted for), some apparently significant correlations could be false positives.
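As an illustration only (not part of the original analysis), a familywise correction such as Bonferroni could be applied to the three pairwise p-values with p.adjust:
pvals = c(cor.test(dfsub$LotArea, dfsub$SalePrice)$p.value,
          cor.test(dfsub$BsmntSqFt, dfsub$SalePrice)$p.value,
          cor.test(dfsub$LotArea, dfsub$BsmntSqFt)$p.value)
p.adjust(pvals, method = "bonferroni")  # adjusted p-values controlling familywise error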
Invert your correlation matrix from above. This is known as the precision matrix and contains variance inflation factors on the diagonal.
## LotArea SalePrice BsmntSqFt
## LotArea 1.0000000 0.2638434 0.2608331
## SalePrice 0.2638434 1.0000000 0.6135806
## BsmntSqFt 0.2608331 0.6135806 1.0000000
# To invert the correlation matrix, use the solve() function
precision_matrix = solve(corMatrix)
print(precision_matrix)
## LotArea SalePrice BsmntSqFt
## LotArea 1.0932718 -0.1820040 -0.1734874
## SalePrice -0.1820040 1.6341000 -0.9551793
## BsmntSqFt -0.1734874 -0.9551793 1.6313307
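The diagonal entries of the precision matrix are the variance inflation factors; as a quick check they could be extracted with, for example:
diag(precision_matrix)  # VIFs for LotArea, SalePrice and BsmntSqFt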
Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.
# To multiply the correlation matrix by the precision matrix
round((corMatrix %*% precision_matrix), 4)
## LotArea SalePrice BsmntSqFt
## LotArea 1 0 0
## SalePrice 0 1 0
## BsmntSqFt 0 0 1
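The reverse product (precision matrix times correlation matrix), whose code is not shown, would be computed analogously, e.g.:
round((precision_matrix %*% corMatrix), 4)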
## LotArea SalePrice BsmntSqFt
## LotArea 1 0 0
## SalePrice 0 1 0
## BsmntSqFt 0 0 1
Both of the above operations produce the identity matrix, as expected, since a matrix multiplied by its inverse (in either order) yields the identity.
Conduct LU decomposition on the matrix.
Correlation Matrix
# LU decomposition of the correlation matrix
lud_cor = lu(corMatrix)
elu_cor = expand(lud_cor)
#Lower triangular matrix
cor_L = elu_cor$L
print(cor_L)
## 3 x 3 Matrix of class "dtrMatrix" (unitriangular)
## [,1] [,2] [,3]
## [1,] 1.0000000 . .
## [2,] 0.2638434 1.0000000 .
## [3,] 0.2608331 0.5855216 1.0000000
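The upper triangular factor printed below is not extracted in the code shown; presumably something like:
#Upper triangular matrix
cor_U = elu_cor$U
print(cor_U)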
## 3 x 3 Matrix of class "dtrMatrix"
## [,1] [,2] [,3]
## [1,] 1.0000000 0.2638434 0.2608331
## [2,] . 0.9303867 0.5447615
## [3,] . . 0.6129965
Precision Matrix
# LU decomposition of the precision matrix
lud_prec = lu(precision_matrix)
elu_prec = expand(lud_prec)
#Lower triangular matrix
prec_L = elu_prec$L
print(prec_L)
## 3 x 3 Matrix of class "dtrMatrix" (unitriangular)
## [,1] [,2] [,3]
## [1,] 1.0000000 . .
## [2,] -0.1664765 1.0000000 .
## [3,] -0.1586864 -0.6135806 1.0000000
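As above, the upper triangular factor of the precision matrix is presumably extracted with:
#Upper triangular matrix
prec_U = elu_prec$U
print(prec_U)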
## 3 x 3 Matrix of class "dtrMatrix"
## [,1] [,2] [,3]
## [1,] 1.0932718 -0.1820040 -0.1734874
## [2,] . 1.6038006 -0.9840609
## [3,] . . 1.0000000
Multiply the lower triangular matrix by the upper triangular matrix to confirm that the product recovers the original matrix.
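The products shown below are presumably computed from the factors extracted above; a minimal sketch:
cor_L %*% cor_U    # should reproduce the correlation matrix
prec_L %*% prec_U  # should reproduce the precision matrix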
## 3 x 3 Matrix of class "dgeMatrix"
## [,1] [,2] [,3]
## [1,] 1.0000000 0.2638434 0.2608331
## [2,] 0.2638434 1.0000000 0.6135806
## [3,] 0.2608331 0.6135806 1.0000000
## 3 x 3 Matrix of class "dgeMatrix"
## [,1] [,2] [,3]
## [1,] 1.0932718 -0.1820040 -0.1734874
## [2,] -0.1820040 1.6341000 -0.9551793
## [3,] -0.1734874 -0.9551793 1.6313307
Multiplying L and U returns the respective original matrix in each case, as expected.
Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary.
# We will use the TotalBsmtSF data. Check the minimum to see whether shifting is necessary.
z = dfsub$BsmntSqFt
min(z)
## [1] 0
The minimum value is 0, so no shift was applied before fitting.
Then load the MASS package and run fitdistr to fit an exponential probability density function.
(See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).
# Fit an exponential probability density function
fit_exp = fitdistr(z, densfun = "exponential")
print(fit_exp)
## rate
## 9.456896e-04
## (2.474983e-05)
Find the optimal value of lambda for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, lambda)).
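The code that extracts the fitted rate and draws the samples is not shown in the rendered output; a minimal sketch using the fitdistr estimate:
(lambda = fit_exp$estimate)  # optimal (maximum-likelihood) rate, printed below
samp = rexp(1000, lambda)    # 1000 samples from the fitted exponential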
## rate
## 0.0009456896
Plot a histogram and compare it with a histogram of your original variable.
# Plot a histogram of the observed and the simulated data
par(mfrow=c(1,2))
hist(z, breaks = 100, xlab = "observed_basementarea", main = "Observed")
hist(samp, breaks = 100, xlab = "predicted_basementArea", main = "Simulated")
Visually, the simulated data are much more heavily skewed to the right, while the observed data are more concentrated around the centre.
Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).
# Find the 5th and 95th percentile of the simulated sample data
quantile(samp, probs = c(0.05, 0.95))
## 5% 95%
## 61.24837 2977.49949
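For comparison, the theoretical 5th and 95th percentiles of the fitted exponential distribution could be taken directly from its inverse CDF, e.g.:
qexp(c(0.05, 0.95), rate = lambda)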
Also generate a 95% confidence interval from the empirical data, assuming normality.
# Calculating the 95% confidence interval for the mean of the observed data
mean_z = mean(z)
n_z = length(z)
sd_z = sd(z)
se = qnorm(0.975) * sd_z/sqrt(n_z)
left_int = mean_z - se
right_int = mean_z + se
print(c(left_int, right_int))
Assuming normality, the 95% confidence interval for the mean basement area runs from roughly 1035 to 1080 square feet (the sample mean of 1057.4 plus or minus 1.96 times the standard error of 11.48).
# We plot a histogram to show the assumed normality
normalitytransformed = rnorm(length(z), mean_z, sd_z)
hist(normalitytransformed)
Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
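The empirical percentiles reported below would be computed directly from the observed variable, e.g.:
quantile(z, probs = c(0.05, 0.95))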
## 5% 95%
## 519.3 1753.0
The empirical 5th and 95th percentiles (about 519 and 1753 sq ft) are far less spread out than those of the fitted exponential (about 61 and 2977), confirming that the exponential distribution is a poor fit: the observed basement areas are concentrated around their centre rather than decaying away from zero.
Build some type of multiple regression model and submit your model to the competition board.
Provide your complete model summary and results with analysis.
Load the selected variables into a dataframe and perform data cleanup operations.
library(dplyr)
library(naniar)
library(tidyselect)
library(caret)
library(purrr)
library(tidymodels)
# Remove features with missing or limited data as evaluated from the summary statistics above.
plot_missing(dftrain,theme_config = theme_bw())
a<-miss_var_summary(dftrain) %>% filter(pct_miss>50) %>% select(variable) %>% unlist() %>% as.vector()
With this we will remove the features that are sparse (more than 50% missing).
lm_spec<-linear_reg() %>%
set_engine(engine = "lm")
set.seed(1)
split<-initial_split(dftrain,prop=.8)
df_train<-training(split)
df_test<-testing(split)
LMRecipe<- df_train %>% recipe(SalePrice~.) %>%
#remove all non numeric based variables
step_rm(all_of(a)) %>%
step_other(all_nominal(),threshold = .05) %>%
step_dummy(all_nominal()) %>%
step_medianimpute(all_predictors()) %>%
step_corr(all_predictors(),threshold = .9) %>%
#step_BoxCox(all_predictors()) %>%
step_nzv(all_predictors()) %>%
prep()
set.seed(1)
lm_res<-fit_resamples(lm_spec,
LMRecipe,
vfold_cv(df_train, v = 10),
control = control_resamples(save_pred = TRUE))
lm_res %>%
collect_metrics()
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 33073. 9 2874. Preprocessor1_Model1
## 2 rsq standard 0.834 9 0.0192 Preprocessor1_Model1
lm_res %>%
unnest(.predictions) %>%
ggplot(aes(SalePrice, .pred, color = id)) +
facet_wrap(~id)+
geom_abline(lty = 2, color = "gray80", size = 1.5) +
geom_point(alpha = 0.5) +
labs(
x = "Truth",
y = "Predicted Sale Price",
color = NULL
)+
stat_poly_eq(aes(label = paste(..adj.rr.label.., sep = "~~~")),
label.x.npc = "left", label.y.npc = .9,
formula = y~x, parse = TRUE, size = 4)+
theme(panel.background = element_blank(),
panel.grid = element_blank())
trainset<-
bake(LMRecipe, new_data = df_train)
testset<-
bake(LMRecipe, new_data = df_test)
model1<-lm_spec %>%
fit(SalePrice~.,data = trainset)
(mod1_sum <- summary(model1$fit))
##
## Call:
## stats::lm(formula = SalePrice ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -392371 -11744 525 10984 215977
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.772e+05 1.328e+06 -0.209 0.834665
## Id 2.044e+00 2.021e+00 1.011 0.312039
## MSSubClass -1.840e+01 6.565e+01 -0.280 0.779327
## LotFrontage 6.841e+01 5.253e+01 1.302 0.193088
## LotArea 3.464e-01 9.300e-02 3.725 0.000206 ***
## OverallQual 1.110e+04 1.202e+03 9.235 < 2e-16 ***
## OverallCond 6.256e+03 1.062e+03 5.893 5.08e-09 ***
## YearBuilt 1.911e+02 8.539e+01 2.238 0.025439 *
## YearRemodAdd 6.970e+01 6.866e+01 1.015 0.310255
## MasVnrArea 2.788e+01 6.961e+00 4.005 6.63e-05 ***
## BsmtFinSF1 3.299e+01 5.127e+00 6.433 1.88e-10 ***
## BsmtFinSF2 3.071e+01 1.015e+01 3.027 0.002527 **
## BsmtUnfSF 2.128e+01 4.714e+00 4.514 7.08e-06 ***
## TotalBsmtSF NA NA NA NA
## X1stFlrSF 2.794e+01 1.869e+01 1.494 0.135380
## X2ndFlrSF 5.586e+01 1.873e+01 2.983 0.002916 **
## GrLivArea 2.016e+01 1.805e+01 1.117 0.264281
## BsmtFullBath 4.271e+03 2.521e+03 1.695 0.090448 .
## BsmtHalfBath 6.627e+01 3.849e+03 0.017 0.986266
## FullBath 2.891e+03 2.765e+03 1.045 0.296114
## HalfBath 2.153e+03 2.630e+03 0.819 0.413094
## BedroomAbvGr -4.388e+03 1.693e+03 -2.592 0.009675 **
## TotRmsAbvGrd 1.393e+03 1.147e+03 1.214 0.224967
## Fireplaces 3.955e+03 1.824e+03 2.169 0.030295 *
## GarageYrBlt 9.104e+01 6.835e+01 1.332 0.183125
## GarageCars 3.051e+03 2.829e+03 1.078 0.281101
## GarageArea 1.122e+01 9.980e+00 1.125 0.261041
## WoodDeckSF 1.854e+01 7.693e+00 2.410 0.016120 *
## OpenPorchSF -1.627e+01 1.470e+01 -1.106 0.268811
## MoSold -3.592e+02 3.145e+02 -1.142 0.253668
## YrSold -1.998e+02 6.567e+02 -0.304 0.760973
## MSZoning_RM -4.996e+03 3.492e+03 -1.431 0.152853
## MSZoning_other -4.588e+03 5.468e+03 -0.839 0.401677
## LotShape_Reg 6.286e+02 2.045e+03 0.307 0.758571
## LandContour_other -8.528e+03 3.493e+03 -2.441 0.014794 *
## LotConfig_CulDSac 1.427e+04 4.188e+03 3.406 0.000684 ***
## LotConfig_Inside -4.953e+02 2.131e+03 -0.232 0.816268
## LandSlope_other 3.698e+03 4.791e+03 0.772 0.440414
## Neighborhood_Edwards -5.677e+03 4.870e+03 -1.166 0.243980
## Neighborhood_Gilbert -5.493e+02 5.116e+03 -0.107 0.914516
## Neighborhood_NAmes 1.203e+03 4.439e+03 0.271 0.786500
## Neighborhood_NridgHt 2.228e+04 5.298e+03 4.205 2.82e-05 ***
## Neighborhood_NWAmes -1.148e+03 5.363e+03 -0.214 0.830515
## Neighborhood_OldTown 3.974e+02 5.588e+03 0.071 0.943318
## Neighborhood_Somerst 1.436e+04 6.234e+03 2.304 0.021441 *
## Neighborhood_other 9.566e+03 3.632e+03 2.634 0.008555 **
## Condition1_Norm 1.027e+04 3.945e+03 2.602 0.009382 **
## Condition1_other -6.319e+03 4.879e+03 -1.295 0.195574
## BldgType_TwnhsE -1.317e+04 7.548e+03 -1.745 0.081202 .
## BldgType_other -1.811e+04 6.947e+03 -2.607 0.009263 **
## HouseStyle_X1Story 1.430e+04 4.230e+03 3.381 0.000749 ***
## HouseStyle_X2Story -5.298e+03 3.818e+03 -1.387 0.165616
## HouseStyle_SLvl 6.489e+03 5.415e+03 1.198 0.231029
## RoofStyle_Hip 9.942e+02 2.412e+03 0.412 0.680269
## Exterior1st_Plywood 8.083e+02 5.723e+03 0.141 0.887722
## Exterior1st_Wd.Sdng -2.866e+03 6.394e+03 -0.448 0.654113
## Exterior1st_other 1.403e+04 5.455e+03 2.573 0.010225 *
## Exterior2nd_MetalSd 6.473e+03 3.456e+03 1.873 0.061323 .
## Exterior2nd_Plywood -6.115e+03 5.423e+03 -1.128 0.259734
## Exterior2nd_VinylSd 2.760e+03 3.209e+03 0.860 0.389921
## Exterior2nd_Wd.Sdng 7.488e+03 6.622e+03 1.131 0.258387
## Exterior2nd_other -5.033e+03 5.564e+03 -0.905 0.365928
## MasVnrType_None 8.163e+03 2.784e+03 2.933 0.003432 **
## MasVnrType_Stone 8.392e+03 3.727e+03 2.252 0.024542 *
## ExterQual_TA -6.090e+03 2.984e+03 -2.041 0.041518 *
## ExterCond_TA -5.744e+02 2.817e+03 -0.204 0.838451
## Foundation_CBlock 3.583e+03 3.642e+03 0.984 0.325452
## Foundation_PConc 5.739e+03 4.096e+03 1.401 0.161439
## BsmtQual_Gd -1.813e+04 3.357e+03 -5.401 8.16e-08 ***
## BsmtQual_TA -1.688e+04 3.674e+03 -4.595 4.84e-06 ***
## BsmtCond_other -3.687e+03 3.374e+03 -1.093 0.274716
## BsmtExposure_Gd 1.548e+04 3.878e+03 3.992 7.01e-05 ***
## BsmtExposure_Mn -5.628e+03 3.965e+03 -1.419 0.156060
## BsmtExposure_No -8.096e+03 2.795e+03 -2.897 0.003845 **
## BsmtFinType1_BLQ -1.237e+03 3.525e+03 -0.351 0.725650
## BsmtFinType1_GLQ 3.400e+03 3.160e+03 1.076 0.282214
## BsmtFinType1_LwQ -6.436e+03 4.645e+03 -1.386 0.166159
## BsmtFinType1_Rec -3.754e+03 3.701e+03 -1.014 0.310606
## BsmtFinType1_Unf -2.399e+03 3.522e+03 -0.681 0.495862
## BsmtFinType2_other -5.539e+03 4.562e+03 -1.214 0.224938
## HeatingQC_Gd -8.272e+02 2.630e+03 -0.315 0.753182
## HeatingQC_TA -5.277e+02 2.494e+03 -0.212 0.832453
## CentralAir_Y -3.891e+01 4.373e+03 -0.009 0.992902
## Electrical_SBrkr 1.952e+03 3.626e+03 0.538 0.590364
## KitchenQual_Gd -2.182e+04 3.515e+03 -6.208 7.67e-10 ***
## KitchenQual_TA -1.852e+04 3.722e+03 -4.976 7.57e-07 ***
## Functional_other -1.464e+04 3.651e+03 -4.009 6.52e-05 ***
## FireplaceQu_TA 2.483e+02 2.637e+03 0.094 0.925004
## GarageType_BuiltIn -4.800e+03 4.097e+03 -1.172 0.241600
## GarageType_Detchd 2.173e+03 2.769e+03 0.785 0.432827
## GarageFinish_RFn -4.212e+03 2.458e+03 -1.714 0.086847 .
## GarageFinish_Unf -1.434e+03 2.936e+03 -0.488 0.625318
## PavedDrive_Y 4.308e+03 3.668e+03 1.174 0.240464
## SaleType_WD -7.130e+04 2.069e+04 -3.446 0.000590 ***
## SaleType_other -6.687e+04 2.065e+04 -3.237 0.001243 **
## SaleCondition_Normal 6.007e+03 3.045e+03 1.973 0.048782 *
## SaleCondition_Partial -4.967e+04 2.071e+04 -2.398 0.016651 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27850 on 1073 degrees of freedom
## Multiple R-squared: 0.8907, Adjusted R-squared: 0.881
## F-statistic: 92.04 on 95 and 1073 DF, p-value: < 2.2e-16
linearM_results<- model1 %>%
predict(new_data = trainset) %>%
mutate(
truth = trainset$SalePrice,
model = "LinearModel",
set = "Training Set"
) %>%
bind_rows(model1 %>%
predict(new_data = testset) %>%
mutate(
truth = testset$SalePrice,
model = "LinearModel",
set = "Testing Set"))
variableImportancePlot <- function(model=NULL, chart_title='Variable Importance Plot') {
# Make sure a model was passed
if (is.null(model)) {
return(NULL)
}
# use caret and gglot to print a variable importance plot
varImp(model) %>% as.data.frame() %>% top_n(n = 10) %>%
ggplot(aes(x = reorder(rownames(.), desc(Overall)), y = Overall)) +
geom_col(aes(fill = Overall)) +
theme(panel.background = element_blank(),
panel.grid = element_blank(),
axis.text.x = element_text(angle = 90)) +
scale_fill_gradient() +
labs(title = chart_title,
x = "Parameter",
y = "Relative Importance")
}
variableImportancePlot(model1$fit, "Linear Model 1 Variable Importance")
linearM_results %>%
ggplot(aes(x = truth,y = .pred))+
geom_point(pch =21, alpha = 0.3, fill = "skyblue") +
geom_smooth(method = "lm", color = "red3") +
facet_wrap(set~model, scales = 'free')+
stat_poly_eq(aes(label = paste(..adj.rr.label.., sep = "~~~")),
label.x.npc = "left", label.y.npc = .9,
formula = y~x, parse = TRUE, size = 4)+
theme(panel.background = element_blank(),
panel.grid = element_blank())
The model produces a multiple R-squared of about 0.89 on the training set, meaning roughly 89% of the variance in SalePrice can be explained by the predictors in the current model; the 10-fold cross-validated R-squared is about 0.83. On the held-out test split, the plot of predicted versus actual values shows a lower adjusted R-squared of around 0.72, suggesting some overfitting. Many predictors have statistically significant p-values. With this model we can now generate predictions for the Kaggle test set.
Let us test how well our model performs on the test data set.
# Load Testing set
dftest = read.csv("test.csv")
dftest$SalePrice<-NA
dftest$BsmtFinSF2<-NA
LMRecipe<- dftrain %>% recipe(SalePrice~.) %>%
#remove all non numeric based variables
step_rm(all_of(a)) %>%
step_rm('BsmtFinSF2') %>%
step_other(all_nominal(),threshold = .05) %>%
step_dummy(all_nominal()) %>%
step_medianimpute(all_predictors()) %>%
step_corr(all_predictors(),threshold = .9) %>%
#step_BoxCox(all_predictors()) %>%
step_nzv(all_predictors()) %>%
prep()
# Predict the saleprice for the testing data set
#
trainset<-
bake(LMRecipe, new_data = dftrain)
testset<-
bake(LMRecipe, new_data = dftest)
model1<-lm_spec %>%
fit(SalePrice~.,data = trainset)
linearM_results<- model1 %>%
predict(new_data = testset) %>%
mutate(Id = testset$Id,
SalePrice = .pred
) %>% select(-.pred)
#write.csv(linearM_results, file = "kagglesubmission.csv", row.names = F)
#View the first 5 rows of the data frame
print(head(linearM_results, 5))
## # A tibble: 5 x 2
## Id SalePrice
## <int> <dbl>
## 1 1461 81008.
## 2 1462 150425.
## 3 1463 171205.
## 4 1464 182365.
## 5 1465 167906.
Report your Kaggle.com user name and score.
Kaggle Username: Joshuargst
Kaggle Score: 0.32199, ranked 3972 on the leaderboard.