## loaded_packages
## prettydoc TRUE
## dplyr TRUE
## psych TRUE
## knitr TRUE
## ggplot2 TRUE
## MASS TRUE
## car TRUE
## MLmetrics TRUE
## broom TRUE
## randomForest TRUE
## e1071 TRUE
## forcats TRUE
## mice TRUE
## missForest TRUE
Your final is due by the end of day on 12/20/2017. You should post your solutions to your GitHub account. You are also expected to make a short presentation during our last meeting (3-5 minutes) or post a recording to the board. This project will show off your ability to understand the elements of the class.
You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques .
I want you to do the following.
Pick one of the quantitative independent variables from the training data set (train.csv), and define that variable as X. Pick SalePrice as the dependent variable, and define it as Y for the next analysis.
Probability.
# load CSVs changed 2 Likert scales & 2 date-component values to categorical
# variables
# https://ww2.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt
train_csv <- read.csv("train.csv", colClasses = c(MSSubClass = "factor", MoSold = "factor",
YrSold = "factor", OverallQual = "factor", OverallCond = "factor"))
test_csv <- read.csv("test.csv", colClasses = c(MSSubClass = "factor", MoSold = "factor",
YrSold = "factor", OverallQual = "factor", OverallCond = "factor"))https://www.theanalysisfactor.com/likert-scale-items-as-predictor-variables-in-regression/ https://www.researchgate.net/post/how_independent_variables_measured_on_likert_scale_should_be_treated_in_binary_logistic_regression_as_continuous_variables_or_ordinal_variables https://www.researchgate.net/post/Is_a_Likert-type_scale_ordinal_or_interval_data https://stats.stackexchange.com/questions/121907/using-years-when-calculating-linear-regression
Calculate, at a minimum, the probabilities a through c below. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 2nd quartile of the Y variable. Interpret the meaning of all probabilities.
X <- train_csv$X1stFlrSF
Y <- train_csv$SalePrice
XY <- cbind(X, Y)
total_length <- nrow(XY)
x <- quantile(X, 0.25)
y <- median(Y)
- \(P(X>x | Y>y)\)
Y_greater_y <- data.frame(subset(XY, Y > y))
len_Y_greater_y <- nrow(Y_greater_y)
X_greater_x <- subset(Y_greater_y, X > x)
len_X_greater_x <- nrow(X_greater_x)
(`P(X>x | Y>y)` <- len_X_greater_x/len_Y_greater_y)
## [1] 0.8543956
Interpretation: given that a home sold for more than the median sale price, there is about an 85% chance that its first-floor square footage exceeds the first quartile.
- \(P(X>x \cap Y>y)\)
(`P(X>x & Y>y)` <- nrow(subset(XY, Y > y & X > x))/total_length)
## [1] 0.4260274
Interpretation: about 43% of all homes both sold for more than the median sale price and have first-floor square footage above the first quartile.
- \(P(X<x | Y>y)\)
Y_greater_y <- data.frame(subset(XY, Y > y))
len_Y_greater_y <- nrow(Y_greater_y)
X_less_x <- subset(Y_greater_y, X < x)
len_X_less_x <- nrow(X_less_x)
(`P(X<x | Y>y)` <- len_X_less_x/len_Y_greater_y)
## [1] 0.1428571
Interpretation: given that a home sold for more than the median sale price, there is only about a 14% chance that its first-floor square footage falls below the first quartile.
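As a cross-check (a sketch, not part of the original solution), the first conditional probability above can also be recovered from the joint and marginal probabilities:
# Sketch: P(X>x | Y>y) = P(X>x & Y>y) / P(Y>y); this should reproduce 0.8543956.
(sum(X > x & Y > y)/total_length)/(sum(Y > y)/total_length)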
Does splitting the training data in this fashion make them independent? In other words, does \(P(X>x \cap Y>y) = P(X>x)\,P(Y>y)\)? Check mathematically, and then evaluate by running a Chi-Square test for association. You might have to research this.
No. The joint probability does not equal the product of the marginal probabilities, so the events are not independent:
\(P(X>x \cap Y>y) \neq P(X>x)\,P(Y>y)\)
(`P(X)P(Y)` <- sum(X > x)/total_length * sum(Y > y)/total_length)
## [1] 0.372948
`P(X>x & Y>y)` == `P(X)P(Y)`
## [1] FALSE
Because the p-value of the chi-square test below is near zero, we reject the null hypothesis that the two events have no association.
(chi_input <- table(X > x, Y > y))
##
##         FALSE TRUE
##   FALSE   262  106
##   TRUE    470  622
prop.table(chi_input)
##
##              FALSE       TRUE
##   FALSE 0.17945205 0.07260274
##   TRUE  0.32191781 0.42602740
(chi_results <- chisq.test(chi_input))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: chi_input
## X-squared = 86.155, df = 1, p-value < 2.2e-16
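As an aside (not part of the original write-up), the expected cell counts under independence — what the chi-square statistic compares the observed counts against — can be pulled straight from the test object:
# Sketch: expected counts under the null hypothesis of independence.
chi_results$expected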
Descriptive and Inferential Statistics.
Provide univariate descriptive statistics and appropriate plots for both variables. Provide a scatterplot of X and Y.
options(scipen = 999)
kable(t(round(describe(XY, quant = c(0.25, 0.75)), 2)))
| | X | Y |
|---|---|---|
| vars | 1.00 | 2.00 |
| n | 1460.00 | 1460.00 |
| mean | 1162.63 | 180921.20 |
| sd | 386.59 | 79442.50 |
| median | 1087.00 | 163000.00 |
| trimmed | 1129.99 | 170783.29 |
| mad | 347.67 | 56338.80 |
| min | 334.00 | 34900.00 |
| max | 4692.00 | 755000.00 |
| range | 4358.00 | 720100.00 |
| skew | 1.37 | 1.88 |
| kurtosis | 5.71 | 6.50 |
| se | 10.12 | 2079.11 |
| Q0.25 | 882.00 | 129975.00 |
| Q0.75 | 1391.25 | 214000.00 |
XY_df <- data.frame(XY)
a <- ggplot(XY_df, aes(X, Y))
a + geom_point()
# X Variable
hist(XY_df$X, freq = FALSE, breaks = 25, main = "Variable X")
xfit <- seq(min(XY_df$X), max(XY_df$X), by = 1)
yfit <- dnorm(xfit, mean(XY_df$X), sd(XY_df$X))
lines(xfit, yfit)
# Y Variable
hist(XY_df$Y, freq = FALSE, breaks = 30, main = "Variable Y")
xfit <- seq(min(XY_df$Y), max(XY_df$Y), by = 10)
yfit <- dnorm(xfit, mean(XY_df$Y), sd(XY_df$Y))
lines(xfit, yfit)
XYmodel <- lm(Y ~ X, data = XY_df)
summary(XYmodel)
##
## Call:
## lm(formula = Y ~ X, data = XY_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -460330 -36494 -13164 36291 414547
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36173.447 5245.728 6.896 0.00000000000795 ***
## X 124.501 4.282 29.078 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 63220 on 1458 degrees of freedom
## Multiple R-squared: 0.3671, Adjusted R-squared: 0.3666
## F-statistic: 845.5 on 1 and 1458 DF, p-value: < 0.00000000000000022
plot(XYmodel, which = 2)
sresid <- studres(XYmodel)
hist(sresid, freq = FALSE, breaks = 25, main = "Distribution of Studentized Residuals")
xfit <- seq(min(sresid), max(sresid), length = 50)
yfit <- dnorm(xfit)
lines(xfit, yfit)
Box-Cox Transformations
Transform both variables simultaneously using Box-Cox transformations. You might have to research this.
The Box-Cox transformations make the relationship more linear and the distributions of X, Y, and the residuals more normal.
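For reference (a reminder, not part of the original write-up), the one-parameter Box-Cox transformation applied below is the standard one:
\[
x^{(\lambda)} =
\begin{cases}
\dfrac{x^{\lambda} - 1}{\lambda}, & \lambda \neq 0 \\
\log(x), & \lambda = 0
\end{cases}
\]
Since both estimated values of \(\lambda\) are close to zero, the transformation is essentially a log transform, which is consistent with the likelihood-ratio test for \(\lambda = (0, 0)\) below.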
(lambda_values <- powerTransform(XY_df))
## Estimated transformation parameters
## X Y
## -0.02413257 -0.02783079
lambda_X <- lambda_values$lambda[1]
lambda_Y <- lambda_values$lambda[2]
XY_df_transform <- data.frame(X = (XY_df$X^lambda_X - 1)/lambda_X, Y = (XY_df$Y^lambda_Y -
1)/lambda_Y)
summary(lambda_values)
## bcPower Transformations to Multinormality
## Est Power Rounded Pwr Wald Lwr bnd Wald Upr Bnd
## X -0.0241 0 -0.1418 0.0935
## Y -0.0278 0 -0.1117 0.0560
##
## Likelihood ratio tests about transformation parameters
## LRT df pval
## LR test, lambda = (0 0) 0.5327201 2 0.7661632
## LR test, lambda = (1 1) 805.8357251 2 0.0000000
kable(t(round(describe(XY_df_transform, quant = c(0.25, 0.75)), 2)))
| | X | Y |
|---|---|---|
| vars | 1.00 | 2.00 |
| n | 1460.00 | 1460.00 |
| mean | 6.45 | 10.22 |
| sd | 0.27 | 0.29 |
| median | 6.43 | 10.20 |
| trimmed | 6.45 | 10.21 |
| mad | 0.29 | 0.26 |
| min | 5.42 | 9.08 |
| max | 7.65 | 11.28 |
| range | 2.23 | 2.20 |
| skew | 0.05 | 0.07 |
| kurtosis | 0.13 | 0.81 |
| se | 0.01 | 0.01 |
| Q0.25 | 6.26 | 10.04 |
| Q0.75 | 6.64 | 10.40 |
a <- ggplot(XY_df_transform, aes(X, Y))
a + geom_point()
# X Variable
hist(XY_df_transform$X, freq = FALSE, breaks = 25, main = "Variable X")
xfit <- seq(min(XY_df_transform$X), max(XY_df_transform$X), by = 0.01)
yfit <- dnorm(xfit, mean(XY_df_transform$X), sd(XY_df_transform$X))
lines(xfit, yfit)
# Y Variable
hist(XY_df_transform$Y, freq = FALSE, breaks = 30, main = "Variable Y")
xfit <- seq(min(XY_df_transform$Y), max(XY_df_transform$Y), by = 0.01)
yfit <- dnorm(xfit, mean(XY_df_transform$Y), sd(XY_df_transform$Y))
lines(xfit, yfit)
XYmodel_BC <- lm(Y ~ X, data = XY_df_transform)
plot(XYmodel_BC, which = 2)
sresid <- studres(XYmodel_BC)
hist(sresid, freq = FALSE, breaks = 25, main = "Distribution of Studentized Residuals")
xfit <- seq(min(sresid), max(sresid), length = 50)
yfit <- dnorm(xfit)
lines(xfit, yfit)
https://en.wikipedia.org/wiki/Power_transform#Box%E2%80%93Cox_transformation
http://rcompanion.org/handbook/I_12.html
http://www.statisticshowto.com/box-cox-transformation/
https://www.rdocumentation.org/packages/car/versions/2.1-6/topics/powerTransform
Linear Algebra and Correlation.
Using at least three untransformed variables, build a correlation matrix. Invert your correlation matrix. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.
selected_vars <- subset(train_csv, select = c(LotArea, X1stFlrSF, OpenPorchSF,
SalePrice))
(cor_matrix <- cor(selected_vars))
## LotArea X1stFlrSF OpenPorchSF SalePrice
## LotArea 1.00000000 0.2994746 0.08477381 0.2638434
## X1stFlrSF 0.29947458 1.0000000 0.21167123 0.6058522
## OpenPorchSF 0.08477381 0.2116712 1.00000000 0.3158562
## SalePrice 0.26384335 0.6058522 0.31585623 1.0000000
(precision_matrix <- solve(cor_matrix))
## LotArea X1stFlrSF OpenPorchSF SalePrice
## LotArea 1.11163502 -0.24534251 0.00376011 -0.1458439
## X1stFlrSF -0.24534251 1.63521337 -0.03649825 -0.9144374
## OpenPorchSF 0.00376011 -0.03649825 1.11163864 -0.3299975
## SalePrice -0.14584387 -0.91443740 -0.32999752 1.6967256
round(cor_matrix %*% precision_matrix, 6)
## LotArea X1stFlrSF OpenPorchSF SalePrice
## LotArea 1 0 0 0
## X1stFlrSF 0 1 0 0
## OpenPorchSF 0 0 1 0
## SalePrice 0 0 0 1
round(precision_matrix %*% cor_matrix, 6)
## LotArea X1stFlrSF OpenPorchSF SalePrice
## LotArea 1 0 0 0
## X1stFlrSF 0 1 0 0
## OpenPorchSF 0 0 1 0
## SalePrice 0 0 0 1
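As a side note (a sketch, not part of the original submission), the diagonal of the precision matrix equals each variable's variance inflation factor, \(1/(1 - R_j^2)\), where \(R_j^2\) comes from regressing that variable on the other three:
# Sketch: the precision-matrix diagonal gives the VIF of each variable.
diag(precision_matrix)
# e.g., reproduce the SalePrice entry from its R-squared against the others:
1/(1 - summary(lm(SalePrice ~ ., data = selected_vars))$r.squared)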
Calculus-Based Probability & Statistics.
Many times, it makes sense to fit a closed form distribution to data. For your non-transformed independent variable, location shift (if necessary) it so that the minimum value is above zero. Then load the MASS package and run fitdistr to fit a density function of your choice. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of the parameters for this distribution, and then take 1000 samples from this distribution (e.g., rexp(1000, ??) for an exponential). Plot a histogram and compare it with a histogram of your non-transformed original variable.
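As a quick sanity check (a sketch, not part of the original code): for an exponential distribution, the maximum-likelihood estimate of the rate parameter is the reciprocal of the sample mean, so fitdistr() below should return approximately this value.
# Sketch: the MLE of the exponential rate is 1 / mean(X).
1/mean(X)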
set.seed(4)
X_fit_f <- fitdistr(X, "exponential")
hist(X, breaks = 30)
rand_samp <- rexp(1000, X_fit_f$estimate[[1]])
hist(rand_samp, breaks = 30)
The exponential sample is far more right-skewed, with its mode near zero, whereas the original variable has a minimum of 334 square feet and is centered near 1,100, so the exponential is a rough approximation at best.
Modeling.
Build some type of regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
Clean Data
# combine test & training
min_test_id <- min(test_csv$Id)
test_csv$SalePrice <- NA
all_data <- rbind(train_csv, test_csv)
# mutate variables in train and test data sets
current_year <- as.integer(format(Sys.Date(), "%Y"))
all_data_cleaned <- all_data %>% # change the actual year values to age in unit years
mutate(years_since_built = current_year - YearBuilt, years_since_remodel = current_year -
YearRemodAdd, years_since_garage_built = current_year - as.integer(GarageYrBlt)) %>%
# Per the data dictionary, the following variables have meaningful NA values
mutate_at(vars(Alley, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2,
FireplaceQu, GarageType, GarageFinish, GarageQual, GarageCond, PoolQC, Fence,
MiscFeature), fct_explicit_na, na_level = "None") %>% dplyr::select(-c(YearBuilt,
YearRemodAdd, GarageYrBlt))
# Remove 2 outliers due to being partial sales of homes
# https://ww2.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt
plot(SalePrice ~ GrLivArea, all_data_cleaned)
rows_to_remove <- which(all_data_cleaned$GrLivArea > 4000 & all_data_cleaned$SalePrice <
2e+05)
all_data_cleaned <- all_data_cleaned[-rows_to_remove, ]
inspect_metadata <- function(df) {
### Takes a data frame & Checks for NAs, incorrect class types, inspects the
### unique values & adds summary metrics
df_len <- nrow(df)
NA_ct = as.vector(rapply(df, function(x) sum(is.na(x))))
var_names <- names(df)
# create dataframe
df_metadata <- data.frame(position = 1:length(var_names), var_names = var_names,
NA_ct = NA_ct, NA_percent = round(NA_ct/df_len, 4), class_type = rapply(df,
class), uniq_value_ct = rapply(df, function(x) length(unique(x))),
uniq_values = rapply(df, function(x) paste(sort(unique(x)), collapse = ",")))
metrics <- data.frame(describe(df))
factor_indices <- which(df_metadata$class_type == "factor")
metrics[factor_indices, ] <- ""
metrics <- metrics[, 2:ncol(metrics)]
df_summary <- cbind(df_metadata, metrics)
return(df_summary)
}
# View(df_metadata <- inspect_metadata(imputed_Data$ximp))
Let's use the missForest package to impute the missing values.
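Before running the imputation, a quick tally of the remaining missing values can be helpful (a sketch, not part of the original code):
# Sketch: count the NAs remaining in each predictor before imputation.
# SalePrice is excluded because the test-set values are intentionally NA.
na_counts <- colSums(is.na(subset(all_data_cleaned, select = -SalePrice)))
sort(na_counts[na_counts > 0], decreasing = TRUE)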
x_data <- subset(all_data_cleaned, select = -c(Id, SalePrice))
imputed_Data <- missForest(x_data)
# check imputed values
sum(is.na(imputed_Data$ximp))
dim(imputed_Data$ximp)
# check imputation error
imputed_Data$OOBerror
Let's combine the variables again and split back into train and test sets.
all_data_completed <- cbind(Id = all_data_cleaned$Id, imputed_Data$ximp, SalePrice = all_data_cleaned$SalePrice)
# Split test & training
train_df <- subset(all_data_completed, Id < min_test_id, select = -Id)
test_df <- subset(all_data_completed, Id >= min_test_id, select = -c(Id, SalePrice))
https://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/
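As a quick sanity check (not part of the original submission), the split should reproduce the original row counts, minus the two training-set outliers removed earlier:
# Sketch: verify the sizes of the re-split data sets.
nrow(train_df)  # expect 1458 (1460 training rows minus 2 outliers)
nrow(test_df)   # expect 1459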
Baseline Model
Create a baseline model with all of the variables.
baseline <- lm(SalePrice ~ ., data = train_df)
The model's diagnostic plots show some violations of the linear regression assumptions: the residuals are neither normally distributed nor homoscedastic.
par(mfrow = c(2, 2))
plot(baseline)
With an adjusted R-squared of .93, this model shows promise, but we need to address the violated linear regression assumptions.
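A formal test of the normality claim could look like the following (a sketch, not part of the original analysis; shapiro.test() accepts up to 5,000 observations, so the roughly 1,458 residuals are within its limit):
# Sketch: Shapiro-Wilk test of the baseline residuals; a very small p-value
# would support the visual evidence of non-normality.
shapiro.test(residuals(baseline))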
model_summary <- function(model, y_var) {
### Summarizes the model's key statistics in one row
df_summary <- glance(summary(model))
df_summary$n_vars <- ncol(model$model) - 1
df_summary$RootMSLE = RMSLE(model$fitted.values, model$model[, y_var])
df_summary$model_name <- deparse(substitute(model))
df_summary <- subset(df_summary, select = c(model_name, n_vars, adj.r.squared,
statistic, RootMSLE, p.value, sigma, df))
}
mod_sum <- model_summary(baseline, "SalePrice")
kable(all_results <- mod_sum)
| model_name | n_vars | adj.r.squared | statistic | RootMSLE | p.value | sigma | df |
|---|---|---|---|---|---|---|---|
| baseline | 79 | 0.9336397 | 71.93048 | 0.101834 | 0 | 20478.31 | 290 |
Model with Transformations
train_df_transformed <- baseline$model
PT <- powerTransform(as.formula(baseline$call), family = "bcnPower", data = train_df_transformed)
train_df_transformed$SalePrice <- bcnPower(train_df_transformed$SalePrice, lambda = PT$lambda,
gamma = PT$gamma)
baseline_PT <- lm(SalePrice ~ ., data = train_df_transformed)
https://stats.stackexchange.com/questions/61217/transforming-variables-for-multiple-regression-in-r
The Q-Q plot isn’t as normal as it could be, and there are a number of outliers.
par(mfrow = c(2, 2))
plot(baseline_PT)
mod_sum <- model_summary(baseline_PT, "SalePrice")
The baseline_PT model has a larger adjusted R-squared and F-statistic, but the diagnostics still show substantial deviations from the linear regression assumptions.
kable(all_results <- rbind(all_results, mod_sum))
| model_name | n_vars | adj.r.squared | statistic | RootMSLE | p.value | sigma | df |
|---|---|---|---|---|---|---|---|
| baseline | 79 | 0.9336397 | 71.93048 | 0.101834 | 0 | 2.047831e+04 | 290 |
| baseline_PT | 79 | 0.9478872 | 92.70106 | 0.000627 | 0 | 3.434700e-03 | 290 |
AIC Stepwise Model
# if(!exists('AIC_model'))
AIC_model <- stepAIC(baseline)
The Q-Q plot isn't normal, and there are a number of outliers.
par(mfrow = c(2, 2))
plot(AIC_model)
The studentized residuals are not as normally distributed as we would prefer.
sresid <- studres(AIC_model)
hist(sresid, freq = FALSE, breaks = 30, main = "AIC_model of Studentized Residuals")
xfit <- seq(min(na.omit(sresid)), max(na.omit(sresid)), by = 0.1)
yfit <- dnorm(xfit)
lines(xfit, yfit)
The Durbin-Watson test's p-value is > .05, so we fail to reject the null hypothesis of no autocorrelation in the residuals.
set.seed(40)
durbinWatsonTest(AIC_model)
## lag Autocorrelation D-W Statistic p-value
## 1 0.03257928 1.934834 0.216
## Alternative hypothesis: rho != 0
The Non-constant Variance Score Test has a p-value near zero, so we reject the null hypothesis of constant error variance (homoscedasticity).
ncvTest(AIC_model)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 953.4205 Df = 1 p = 2.394437e-209
- The forward and backward stepwise selection of the MASS::stepAIC function reduces the model to 44 predictors.
- Adjusted R-squared is about the same as baseline and a little lower than baseline_PT.
  + While the F-statistic is much larger, we still have some issues with our assumptions to address.
mod_sum <- model_summary(AIC_model, "SalePrice")
kable(all_results <- rbind(all_results, mod_sum))
| model_name | n_vars | adj.r.squared | statistic | RootMSLE | p.value | sigma | df |
|---|---|---|---|---|---|---|---|
| baseline | 79 | 0.9336397 | 71.93048 | 0.101834 | 0 | 2.047831e+04 | 290 |
| baseline_PT | 79 | 0.9478872 | 92.70106 | 0.000627 | 0 | 3.434700e-03 | 290 |
| AIC_model | 44 | 0.9343153 | 124.36151 | 0.108105 | 0 | 2.037380e+04 | 169 |
AIC Stepwise Model with Transformations
Let’s use the 44 predictors and see if a power transformation of the response variable will address some of the issues with the AIC_model.
train_df_AIC_PT <- AIC_model$model
PT <- powerTransform(as.formula(AIC_model$call), family = "bcnPower", data = train_df_AIC_PT)
summary(PT)
train_df_AIC_PT$SalePrice <- bcnPower(train_df_AIC_PT$SalePrice, lambda = PT$lambda,
gamma = PT$gamma)
AIC_model_PT <- lm(SalePrice ~ ., data = train_df_AIC_PT)
# if(!exists('AIC_model_PT'))
AIC_model_PT <- stepAIC(AIC_model_PT)
This model's residuals are closer to homoscedastic and its Q-Q plot is closer to normal, but some outliers remain. Let's check the assumptions with the following tests.
par(mfrow = c(2, 2))
plot(AIC_model_PT)
The studentized residuals appear relatively normally distributed.
sresid <- studres(AIC_model_PT)
hist(sresid, freq = FALSE, breaks = 20, main = "AIC_model_PT of Studentized Residuals")
xfit <- seq(min(na.omit(sresid)), max(na.omit(sresid)), by = 0.1)
yfit <- dnorm(xfit)
lines(xfit, yfit)
The Durbin-Watson test's p-value is > .05, so we fail to reject the null hypothesis of no autocorrelation in the residuals.
set.seed(40)
durbinWatsonTest(AIC_model_PT)
## lag Autocorrelation D-W Statistic p-value
## 1 0.04790739 1.90385 0.066
## Alternative hypothesis: rho != 0
The Non-constant Variance Score Test has a p-value > .05, so we fail to reject the null hypothesis of constant error variance (homoscedasticity).
ncvTest(AIC_model_PT)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 2.027105 Df = 1 p = 0.1545147
This time the AIC procedure narrowed the model down to 38 predictors; the F-statistic is the highest of any model so far, and the adjusted R-squared is close to that of baseline_PT.
mod_sum <- model_summary(AIC_model_PT, "SalePrice")
kable(all_results <- rbind(all_results, mod_sum))
| model_name | n_vars | adj.r.squared | statistic | RootMSLE | p.value | sigma | df |
|---|---|---|---|---|---|---|---|
| baseline | 79 | 0.9336397 | 71.93048 | 0.1018340 | 0 | 2.047831e+04 | 290 |
| baseline_PT | 79 | 0.9478872 | 92.70106 | 0.0006270 | 0 | 3.434700e-03 | 290 |
| AIC_model | 44 | 0.9343153 | 124.36151 | 0.1081050 | 0 | 2.037380e+04 | 169 |
| AIC_model_PT | 38 | 0.9453143 | 168.90771 | 0.0029938 | 0 | 2.821150e-02 | 151 |
AIC Stepwise Conclusion
Since it doesn't appear that we can back-transform our response variable after using bcnPower, let's predict the response values of the test set using the untransformed AIC_model and see how it does on Kaggle. It received an RMSLE score of 0.18310.
significant_predictors <- intersect(names(AIC_model$model), names(test_df))
AIC_model_fit <- predict(AIC_model, newdata = subset(test_df, select = significant_predictors))
kable(t(round(describe(AIC_model_fit), 2)))
| | X1 |
|---|---|
| vars | 1.00 |
| n | 1459.00 |
| mean | 180061.06 |
| sd | 80115.93 |
| median | 158942.71 |
| trimmed | 169538.85 |
| mad | 57745.59 |
| min | 18303.62 |
| max | 765132.60 |
| range | 746828.98 |
| skew | 1.53 |
| kurtosis | 3.73 |
| se | 2097.45 |
AIC_model_fit_output <- data.frame(Id = test_csv$Id, SalePrice = AIC_model_fit)
write.csv(AIC_model_fit_output, "AIC_model_fit.csv", row.names = FALSE)
https://stats.stackexchange.com/questions/117407/predict-after-using-box-cox-transformation
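For comparison only (a hypothetical sketch, not what was submitted), the selected predictors could instead be fit against log(SalePrice), which, unlike a bcnPower transform, back-transforms cleanly with exp():
# Sketch (hypothetical): a log-response version of the AIC model whose
# predictions can be back-transformed with exp().
train_df_log <- AIC_model$model
train_df_log$SalePrice <- log(train_df_log$SalePrice)
log_model <- lm(SalePrice ~ ., data = train_df_log)
log_fit <- exp(predict(log_model, newdata = subset(test_df, select = significant_predictors)))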
Random Forest
Let's also try a Random Forest regression model on the training data set with the imputed missing values. The model explains about 89% of the variance in the response variable.
y_train <- train_df$SalePrice
x_train_df <- subset(train_df, select = -SalePrice)
(rf_fit <- randomForest(x_train_df, y = y_train, ntree = 500, importance = T))
##
## Call:
## randomForest(x = x_train_df, y = y_train, ntree = 500, importance = T)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 26
##
## Mean of squared residuals: 670082579
## % Var explained: 89.39
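Since the forest was grown with importance = T, a variable-importance plot can also be inspected (a sketch, not part of the original run):
# Sketch: top 15 predictors by importance.
varImpPlot(rf_fit, n.var = 15, main = "Random Forest Variable Importance")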
plot(rf_fit)
rf_model_fit <- predict(rf_fit, test_df)
# Summary of Predicted Output
kable(t(round(describe(rf_model_fit), 2)))
| | X1 |
|---|---|
| vars | 1.00 |
| n | 1459.00 |
| mean | 179738.06 |
| sd | 70094.81 |
| median | 160144.00 |
| trimmed | 169784.84 |
| mad | 48830.13 |
| min | 69309.50 |
| max | 520506.73 |
| range | 451197.22 |
| skew | 1.54 |
| kurtosis | 2.86 |
| se | 1835.09 |
rf_model_fit_output <- data.frame(Id = test_csv$Id, SalePrice = rf_model_fit)
write.csv(rf_model_fit_output, "rf_model_fit.csv", row.names = FALSE)
Conclusions
The Random Forest model received a Root Mean Squared Logarithmic Error (RMSLE) score of 0.15497 (under username KyleGilde) and outperformed the untransformed AIC stepwise model, which received a score of 0.18310.
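For reference, the RMSLE metric as commonly defined (and as computed by MLmetrics::RMSLE) is
\[
\mathrm{RMSLE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\bigl(\log(\hat{y}_i + 1) - \log(y_i + 1)\bigr)^2},
\]
where \(\hat{y}_i\) are the predicted sale prices and \(y_i\) are the actual sale prices; lower scores are better.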