## loaded_packages
## prettydoc TRUE
## dplyr TRUE
## psych TRUE
## knitr TRUE
## ggplot2 TRUE
## MASS TRUE
## car TRUE
## MLmetrics TRUE
## broom TRUE
## randomForest TRUE
## e1071 TRUE
## forcats TRUE
## mice TRUE
## missForest TRUE
Your final is due by the end of day on 12/20/2017. You should post your solutions to your GitHub account. You are also expected to make a short presentation during our last meeting (3-5 minutes) or post a recording to the board. This project will show off your ability to understand the elements of the class.
You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques .
I want you to do the following.
Pick one of the quantitative independent variables from the training data set (train.csv), and define that variable as X. Pick SalePrice as the dependent variable, and define it as Y for the next analysis.
Probability.
# load CSVs changed 2 Likert scales & 2 date-component values to categorical
# variables
# https://ww2.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt
train_csv <- read.csv("train.csv", colClasses = c(MSSubClass = "factor", MoSold = "factor",
YrSold = "factor", OverallQual = "factor", OverallCond = "factor"))
test_csv <- read.csv("test.csv", colClasses = c(MSSubClass = "factor", MoSold = "factor",
YrSold = "factor", OverallQual = "factor", OverallCond = "factor"))https://www.theanalysisfactor.com/likert-scale-items-as-predictor-variables-in-regression/ https://www.researchgate.net/post/how_independent_variables_measured_on_likert_scale_should_be_treated_in_binary_logistic_regression_as_continuous_variables_or_ordinal_variables https://www.researchgate.net/post/Is_a_Likert-type_scale_ordinal_or_interval_data https://stats.stackexchange.com/questions/121907/using-years-when-calculating-linear-regression
Calculate, at a minimum, the probabilities a through c below. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 2nd quartile of the Y variable. Interpret the meaning of all probabilities.
X <- train_csv$X1stFlrSF
Y <- train_csv$SalePrice
XY <- cbind(X, Y)
total_length <- nrow(XY)
x <- quantile(X, 0.25)
y <- median(Y)
- \(P(X>x | Y>y)\)
Y_greater_y <- data.frame(subset(XY, Y > y))
len_Y_greater_y <- nrow(Y_greater_y)
X_greater_x <- subset(Y_greater_y, X > x)
len_X_greater_x <- nrow(X_greater_x)
(`P(X>x | Y>y)` <- len_X_greater_x/len_Y_greater_y)
## [1] 0.8543956
Interpretation: given that a home sold for more than the median sale price, there is about an 85% chance that its first-floor square footage exceeds the first quartile.
- \(P(X>x \cap Y>y)\)
(`P(X>x & Y>y)` <- nrow(subset(XY, Y > y & X > x))/total_length)
## [1] 0.4260274
Interpretation: about 43% of all homes both sold for more than the median sale price and have first-floor square footage above the first quartile.
- \(P(X<x | Y>y)\)
Y_greater_y <- data.frame(subset(XY, Y > y))
len_Y_greater_y <- nrow(Y_greater_y)
X_less_x <- subset(Y_greater_y, X < x)
len_X_less_x <- nrow(X_less_x)
(`P(X<x | Y>y)` <- len_X_less_x/len_Y_greater_y)
## [1] 0.1428571
Interpretation: given that a home sold for more than the median sale price, there is only about a 14% chance that its first-floor square footage falls below the first quartile.
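As a cross-check (a sketch, not part of the original solution), the first conditional probability above can also be recovered from the joint and marginal probabilities:
# Sketch: P(X>x | Y>y) = P(X>x & Y>y) / P(Y>y); this should reproduce 0.8543956.
(sum(X > x & Y > y)/total_length)/(sum(Y > y)/total_length)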
Does splitting the training data in this fashion make them independent? In other words, does \(P(X>x \cap Y>y) = P(X>x)\,P(Y>y)\)? Check mathematically, and then evaluate by running a Chi-Square test for association. You might have to research this.
No. The joint probability does not equal the product of the marginal probabilities, so the events are not independent:
\(P(X>x \cap Y>y) \neq P(X>x)\,P(Y>y)\)
(`P(X)P(Y)` <- sum(X > x)/total_length * sum(Y > y)/total_length)
## [1] 0.372948
`P(X>x & Y>y)` == `P(X)P(Y)`
## [1] FALSE
Because the p-value of the chi-square test below is near zero, we reject the null hypothesis that the two events have no association.
(chi_input <- table(X > x, Y > y))
##
##         FALSE TRUE
##   FALSE   262  106
##   TRUE    470  622
prop.table(chi_input)
##
##              FALSE       TRUE
##   FALSE 0.17945205 0.07260274
##   TRUE  0.32191781 0.42602740
(chi_results <- chisq.test(chi_input))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: chi_input
## X-squared = 86.155, df = 1, p-value < 2.2e-16
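As an aside (not part of the original write-up), the expected cell counts under independence — what the chi-square statistic compares the observed counts against — can be pulled straight from the test object:
# Sketch: expected counts under the null hypothesis of independence.
chi_results$expected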
Descriptive and Inferential Statistics.
Provide univariate descriptive statistics and appropriate plots for both variables. Provide a scatterplot of X and Y.
options(scipen = 999)
kable(t(round(describe(XY, quant = c(0.25, 0.75)), 2)))
| | X | Y |
|---|---|---|
| vars | 1.00 | 2.00 |
| n | 1460.00 | 1460.00 |
| mean | 1162.63 | 180921.20 |
| sd | 386.59 | 79442.50 |
| median | 1087.00 | 163000.00 |
| trimmed | 1129.99 | 170783.29 |
| mad | 347.67 | 56338.80 |
| min | 334.00 | 34900.00 |
| max | 4692.00 | 755000.00 |
| range | 4358.00 | 720100.00 |
| skew | 1.37 | 1.88 |
| kurtosis | 5.71 | 6.50 |
| se | 10.12 | 2079.11 |
| Q0.25 | 882.00 | 129975.00 |
| Q0.75 | 1391.25 | 214000.00 |
XY_df <- data.frame(XY)
a <- ggplot(XY_df, aes(X, Y))
a + geom_point()
# X Variable
hist(XY_df$X, freq = FALSE, breaks = 25, main = "Variable X")
xfit <- seq(min(XY_df$X), max(XY_df$X), by = 1)
yfit <- dnorm(xfit, mean(XY_df$X), sd(XY_df$X))
lines(xfit, yfit)
# Y Variable
hist(XY_df$Y, freq = FALSE, breaks = 30, main = "Variable Y")
xfit <- seq(min(XY_df$Y), max(XY_df$Y), by = 10)
yfit <- dnorm(xfit, mean(XY_df$Y), sd(XY_df$Y))
lines(xfit, yfit)
XYmodel <- lm(Y ~ X, data = XY_df)
summary(XYmodel)
##
## Call:
## lm(formula = Y ~ X, data = XY_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -460330 -36494 -13164 36291 414547
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36173.447 5245.728 6.896 0.00000000000795 ***
## X 124.501 4.282 29.078 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 63220 on 1458 degrees of freedom
## Multiple R-squared: 0.3671, Adjusted R-squared: 0.3666
## F-statistic: 845.5 on 1 and 1458 DF, p-value: < 0.00000000000000022
plot(XYmodel, which = 2)
sresid <- studres(XYmodel)
hist(sresid, freq = FALSE, breaks = 25, main = "Distribution of Studentized Residuals")
xfit <- seq(min(sresid), max(sresid), length = 50)
yfit <- dnorm(xfit)
lines(xfit, yfit)
Box-Cox Transformations
Transform both variables simultaneously using Box-Cox transformations. You might have to research this.
The Box-Cox transformations make the relationship more linear and the distributions of X, Y, and the residuals more normal.
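For reference (a reminder, not part of the original write-up), the one-parameter Box-Cox transformation applied below is the standard one:
\[
x^{(\lambda)} =
\begin{cases}
\dfrac{x^{\lambda} - 1}{\lambda}, & \lambda \neq 0 \\
\log(x), & \lambda = 0
\end{cases}
\]
Since both estimated values of \(\lambda\) are close to zero, the transformation is essentially a log transform, which is consistent with the likelihood-ratio test for \(\lambda = (0, 0)\) below.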
(lambda_values <- powerTransform(XY_df))
## Estimated transformation parameters
## X Y
## -0.02413257 -0.02783079
lambda_X <- lambda_values$lambda[1]
lambda_Y <- lambda_values$lambda[2]
XY_df_transform <- data.frame(X = (XY_df$X^lambda_X - 1)/lambda_X, Y = (XY_df$Y^lambda_Y -
1)/lambda_Y)
summary(lambda_values)
## bcPower Transformations to Multinormality
## Est Power Rounded Pwr Wald Lwr bnd Wald Upr Bnd
## X -0.0241 0 -0.1418 0.0935
## Y -0.0278 0 -0.1117 0.0560
##
## Likelihood ratio tests about transformation parameters
## LRT df pval
## LR test, lambda = (0 0) 0.5327201 2 0.7661632
## LR test, lambda = (1 1) 805.8357251 2 0.0000000
kable(t(round(describe(XY_df_transform, quant = c(0.25, 0.75)), 2)))
| | X | Y |
|---|---|---|
| vars | 1.00 | 2.00 |
| n | 1460.00 | 1460.00 |
| mean | 6.45 | 10.22 |
| sd | 0.27 | 0.29 |
| median | 6.43 | 10.20 |
| trimmed | 6.45 | 10.21 |
| mad | 0.29 | 0.26 |
| min | 5.42 | 9.08 |
| max | 7.65 | 11.28 |
| range | 2.23 | 2.20 |
| skew | 0.05 | 0.07 |
| kurtosis | 0.13 | 0.81 |
| se | 0.01 | 0.01 |
| Q0.25 | 6.26 | 10.04 |
| Q0.75 | 6.64 | 10.40 |
a <- ggplot(XY_df_transform, aes(X, Y))
a + geom_point()
# X Variable
hist(XY_df_transform$X, freq = FALSE, breaks = 25, main = "Variable X")
xfit <- seq(min(XY_df_transform$X), max(XY_df_transform$X), by = 0.01)
yfit <- dnorm(xfit, mean(XY_df_transform$X), sd(XY_df_transform$X))
lines(xfit, yfit)
# Y Variable
hist(XY_df_transform$Y, freq = FALSE, breaks = 30, main = "Variable Y")
xfit <- seq(min(XY_df_transform$Y), max(XY_df_transform$Y), by = 0.01)
yfit <- dnorm(xfit, mean(XY_df_transform$Y), sd(XY_df_transform$Y))
lines(xfit, yfit)
XYmodel_BC <- lm(Y ~ X, data = XY_df_transform)
plot(XYmodel_BC, which = 2)
sresid <- studres(XYmodel_BC)
hist(sresid, freq = FALSE, breaks = 25, main = "Distribution of Studentized Residuals")
xfit <- seq(min(sresid), max(sresid), length = 50)
yfit <- dnorm(xfit)
lines(xfit, yfit)
https://en.wikipedia.org/wiki/Power_transform#Box%E2%80%93Cox_transformation
http://rcompanion.org/handbook/I_12.html
http://www.statisticshowto.com/box-cox-transformation/
https://www.rdocumentation.org/packages/car/versions/2.1-6/topics/powerTransform
Linear Algebra and Correlation.
Using at least three untransformed variables, build a correlation matrix. Invert your correlation matrix. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.
selected_vars <- subset(train_csv, select = c(LotArea, X1stFlrSF, OpenPorchSF,
SalePrice))
(cor_matrix <- cor(selected_vars))
## LotArea X1stFlrSF OpenPorchSF SalePrice
## LotArea 1.00000000 0.2994746 0.08477381 0.2638434
## X1stFlrSF 0.29947458 1.0000000 0.21167123 0.6058522
## OpenPorchSF 0.08477381 0.2116712 1.00000000 0.3158562
## SalePrice 0.26384335 0.6058522 0.31585623 1.0000000
(precision_matrix <- solve(cor_matrix))
## LotArea X1stFlrSF OpenPorchSF SalePrice
## LotArea 1.11163502 -0.24534251 0.00376011 -0.1458439
## X1stFlrSF -0.24534251 1.63521337 -0.03649825 -0.9144374
## OpenPorchSF 0.00376011 -0.03649825 1.11163864 -0.3299975
## SalePrice -0.14584387 -0.91443740 -0.32999752 1.6967256
round(cor_matrix %*% precision_matrix, 6)
## LotArea X1stFlrSF OpenPorchSF SalePrice
## LotArea 1 0 0 0
## X1stFlrSF 0 1 0 0
## OpenPorchSF 0 0 1 0
## SalePrice 0 0 0 1
round(precision_matrix %*% cor_matrix, 6)
## LotArea X1stFlrSF OpenPorchSF SalePrice
## LotArea 1 0 0 0
## X1stFlrSF 0 1 0 0
## OpenPorchSF 0 0 1 0
## SalePrice 0 0 0 1
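As a side note (a sketch, not part of the original submission), the diagonal of the precision matrix equals each variable's variance inflation factor, \(1/(1 - R_j^2)\), where \(R_j^2\) comes from regressing that variable on the other three:
# Sketch: the precision-matrix diagonal gives the VIF of each variable.
diag(precision_matrix)
# e.g., reproduce the SalePrice entry from its R-squared against the others:
1/(1 - summary(lm(SalePrice ~ ., data = selected_vars))$r.squared)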
Calculus-Based Probability & Statistics.
Many times, it makes sense to fit a closed form distribution to data. For your non-transformed independent variable, location shift (if necessary) it so that the minimum value is above zero. Then load the MASS package and run fitdistr to fit a density function of your choice. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of the parameters for this distribution, and then take 1000 samples from this distribution (e.g., rexp(1000, ??) for an exponential). Plot a histogram and compare it with a histogram of your non-transformed original variable.
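As a quick sanity check (a sketch, not part of the original code): for an exponential distribution, the maximum-likelihood estimate of the rate parameter is the reciprocal of the sample mean, so fitdistr() below should return approximately this value.
# Sketch: the MLE of the exponential rate is 1 / mean(X).
1/mean(X)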
set.seed(4)
X_fit_f <- fitdistr(X, "exponential")
hist(X, breaks = 30)
rand_samp <- rexp(1000, X_fit_f$estimate[[1]])
hist(rand_samp, breaks = 30)
The exponential sample is far more right-skewed, with its mode near zero, whereas the original variable has a minimum of 334 square feet and is centered near 1,100, so the exponential is a rough approximation at best.
Modeling.
Build some type of regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
Clean Data
# combine test & training
min_test_id <- min(test_csv$Id)
test_csv$SalePrice <- NA
all_data <- rbind(train_csv, test_csv)
# mutate variables in train and test data sets
current_year <- as.integer(format(Sys.Date(), "%Y"))
all_data_cleaned <- all_data %>% # change the actual year values to age in unit years
mutate(years_since_built = current_year - YearBuilt, years_since_remodel = current_year -
YearRemodAdd, years_since_garage_built = current_year - as.integer(GarageYrBlt)) %>%
# Per the data dictionary, the following variables have meaningful NA values
mutate_at(vars(Alley, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2,
FireplaceQu, GarageType, GarageFinish, GarageQual, GarageCond, PoolQC, Fence,
MiscFeature), fct_explicit_na, na_level = "None") %>% dplyr::select(-c(YearBuilt,
YearRemodAdd, GarageYrBlt))
# Remove 2 outliers due to being partial sales of homes
# https://ww2.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt
plot(SalePrice ~ GrLivArea, all_data_cleaned)
rows_to_remove <- which(all_data_cleaned$GrLivArea > 4000 & all_data_cleaned$SalePrice <
2e+05)
all_data_cleaned <- all_data_cleaned[-rows_to_remove, ]
inspect_metadata <- function(df) {
### Takes a data frame & Checks for NAs, incorrect class types, inspects the
### unique values & adds summary metrics
df_len <- nrow(df)
NA_ct = as.vector(rapply(df, function(x) sum(is.na(x))))
var_names <- names(df)
# create dataframe
df_metadata <- data.frame(position = 1:length(var_names), var_names = var_names,
NA_ct = NA_ct, NA_percent = round(NA_ct/df_len, 4), class_type = rapply(df,
class), uniq_value_ct = rapply(df, function(x) length(unique(x))),
uniq_values = rapply(df, function(x) paste(sort(unique(x)), collapse = ",")))
metrics <- data.frame(describe(df))
factor_indices <- which(df_metadata$class_type == "factor")
metrics[factor_indices, ] <- ""
metrics <- metrics[, 2:ncol(metrics)]
df_summary <- cbind(df_metadata, metrics)
return(df_summary)
}
# View(df_metadata <- inspect_metadata(imputed_Data$ximp))
Let's use the missForest package to impute the missing values.
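Before running the imputation, a quick tally of the remaining missing values can be helpful (a sketch, not part of the original code):
# Sketch: count the NAs remaining in each predictor before imputation.
# SalePrice is excluded because the test-set values are intentionally NA.
na_counts <- colSums(is.na(subset(all_data_cleaned, select = -SalePrice)))
sort(na_counts[na_counts > 0], decreasing = TRUE)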
x_data <- subset(all_data_cleaned, select = -c(Id, SalePrice))
imputed_Data <- missForest(x_data)
# check imputed values
sum(is.na(imputed_Data$ximp))
dim(imputed_Data$ximp)
# check imputation error
imputed_Data$OOBerror
Let's combine the variables again and split back into train and test sets.
all_data_completed <- cbind(Id = all_data_cleaned$Id, imputed_Data$ximp, SalePrice = all_data_cleaned$SalePrice)
# Split test & training
train_df <- subset(all_data_completed, Id < min_test_id, select = -Id)
test_df <- subset(all_data_completed, Id >= min_test_id, select = -c(Id, SalePrice))
https://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/
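As a quick sanity check (not part of the original submission), the split should reproduce the original row counts, minus the two training-set outliers removed earlier:
# Sketch: verify the sizes of the re-split data sets.
nrow(train_df)  # expect 1458 (1460 training rows minus 2 outliers)
nrow(test_df)   # expect 1459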
Baseline Model
Create a baseline model with all of the variables.
baseline <- lm(SalePrice ~ ., data = train_df)
The model's diagnostic plots show some violations of the linear regression assumptions: the residuals are neither normally distributed nor homoscedastic.
par(mfrow = c(2, 2))
plot(baseline)
With an adjusted R-squared of .93, this model shows promise, but we need to address the violated linear regression assumptions.
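A formal test of the normality claim could look like the following (a sketch, not part of the original analysis; shapiro.test() accepts up to 5,000 observations, so the roughly 1,458 residuals are within its limit):
# Sketch: Shapiro-Wilk test of the baseline residuals; a very small p-value
# would support the visual evidence of non-normality.
shapiro.test(residuals(baseline))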
model_summary <- function(model, y_var) {
### Summarizes the model's key statistics in one row
df_summary <- glance(summary(model))
df_summary$n_vars <- ncol(model$model) - 1
df_summary$RootMSLE = RMSLE(model$fitted.values, model$model[, y_var])
df_summary$model_name <- deparse(substitute(model))
df_summary <- subset(df_summary, select = c(model_name, n_vars, adj.r.squared,
statistic, RootMSLE, p.value, sigma, df))
}
mod_sum <- model_summary(baseline, "SalePrice")
kable(all_results <- mod_sum)
| model_name | n_vars | adj.r.squared | statistic | RootMSLE | p.value | sigma | df |
|---|---|---|---|---|---|---|---|
| baseline | 79 | 0.9336397 | 71.93048 | 0.101834 | 0 | 20478.31 | 290 |
Model with Transformations
train_df_transformed <- baseline$model
PT <- powerTransform(as.formula(baseline$call), family = "bcnPower", data = train_df_transformed)
train_df_transformed$SalePrice <- bcnPower(train_df_transformed$SalePrice, lambda = PT$lambda,
gamma = PT$gamma)
baseline_PT <- lm(SalePrice ~ ., data = train_df_transformed)
https://stats.stackexchange.com/questions/61217/transforming-variables-for-multiple-regression-in-r
The Q-Q plot isn’t as normal as it could be, and there are a number of outliers.
par(mfrow = c(2, 2))
plot(baseline_PT)
mod_sum <- model_summary(baseline_PT, "SalePrice")
The baseline_PT model has a larger adjusted R-squared and F-statistic, but the diagnostics still show substantial deviations from the linear regression assumptions.
kable(all_results <- rbind(all_results, mod_sum))
| model_name | n_vars | adj.r.squared | statistic | RootMSLE | p.value | sigma | df |
|---|---|---|---|---|---|---|---|
| baseline | 79 | 0.9336397 | 71.93048 | 0.101834 | 0 | 2.047831e+04 | 290 |
| baseline_PT | 79 | 0.9478872 | 92.70106 | 0.000627 | 0 | 3.434700e-03 | 290 |
AIC Stepwise Model
# if(!exists('AIC_model'))
AIC_model <- stepAIC(baseline)
The Q-Q plot isn't normal, and there are a number of outliers.
par(mfrow = c(2, 2))
plot(AIC_model)
The studentized residuals are not as normally distributed as we would prefer.
sresid <- studres(AIC_model)
hist(sresid, freq = FALSE, breaks = 30, main = "AIC_model of Studentized Residuals")
xfit <- seq(min(na.omit(sresid)), max(na.omit(sresid)), by = 0.1)
yfit <- dnorm(xfit)
lines(xfit, yfit)
The Durbin-Watson test's p-value is > .05, so we fail to reject the null hypothesis of no autocorrelation in the residuals.
set.seed(40)
durbinWatsonTest(AIC_model)
## lag Autocorrelation D-W Statistic p-value
## 1 0.03257928 1.934834 0.216
## Alternative hypothesis: rho != 0
The Non-constant Variance Score Test has a p-value near zero, so we reject the null hypothesis of constant error variance (homoscedasticity).
ncvTest(AIC_model)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 953.4205 Df = 1 p = 2.394437e-209
- The forward and backward stepwise selection of the MASS::stepAIC function reduces the model to 44 predictors.
- Adjusted R-squared is about the same as baseline and a little lower than baseline_PT.
  + While the F-statistic is much larger, we still have some issues with our assumptions to address.
mod_sum <- model_summary(AIC_model, "SalePrice")
kable(all_results <- rbind(all_results, mod_sum))
| model_name | n_vars | adj.r.squared | statistic | RootMSLE | p.value | sigma | df |
|---|---|---|---|---|---|---|---|
| baseline | 79 | 0.9336397 | 71.93048 | 0.101834 | 0 | 2.047831e+04 | 290 |
| baseline_PT | 79 | 0.9478872 | 92.70106 | 0.000627 | 0 | 3.434700e-03 | 290 |
| AIC_model | 44 | 0.9343153 | 124.36151 | 0.108105 | 0 | 2.037380e+04 | 169 |
AIC Stepwise Model with Transformations
Let’s use the 44 predictors and see if a power transformation of the response variable will address some of the issues with the AIC_model.
train_df_AIC_PT <- AIC_model$model
PT <- powerTransform(as.formula(AIC_model$call), family = "bcnPower", data = train_df_AIC_PT)
summary(PT)
train_df_AIC_PT$SalePrice <- bcnPower(train_df_AIC_PT$SalePrice, lambda = PT$lambda,
gamma = PT$gamma)
AIC_model_PT <- lm(SalePrice ~ ., data = train_df_AIC_PT)
# if(!exists('AIC_model_PT'))
AIC_model_PT <- stepAIC(AIC_model_PT)
This model's residuals are closer to homoscedastic and its Q-Q plot is closer to normal, but some outliers remain. Let's check the assumptions with the following tests.
par(mfrow = c(2, 2))
plot(AIC_model_PT)
The studentized residuals appear relatively normally distributed.
sresid <- studres(AIC_model_PT)
hist(sresid, freq = FALSE, breaks = 20, main = "AIC_model_PT of Studentized Residuals")
xfit <- seq(min(na.omit(sresid)), max(na.omit(sresid)), by = 0.1)
yfit <- dnorm(xfit)
lines(xfit, yfit)
The Durbin-Watson test's p-value is > .05, so we fail to reject the null hypothesis of no autocorrelation in the residuals.
set.seed(40)
durbinWatsonTest(AIC_model_PT)
## lag Autocorrelation D-W Statistic p-value
## 1 0.04790739 1.90385 0.066
## Alternative hypothesis: rho != 0
The Non-constant Variance Score Test has a p-value > .05, so we fail to reject the null hypothesis of constant error variance (homoscedasticity).
ncvTest(AIC_model_PT)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 2.027105 Df = 1 p = 0.1545147
This time the AIC procedure narrowed the model down to 38 predictors; the F-statistic is the highest of any model so far, and the adjusted R-squared is close to that of baseline_PT.
mod_sum <- model_summary(AIC_model_PT, "SalePrice")
kable(all_results <- rbind(all_results, mod_sum))
| model_name | n_vars | adj.r.squared | statistic | RootMSLE | p.value | sigma | df |
|---|---|---|---|---|---|---|---|
| baseline | 79 | 0.9336397 | 71.93048 | 0.1018340 | 0 | 2.047831e+04 | 290 |
| baseline_PT | 79 | 0.9478872 | 92.70106 | 0.0006270 | 0 | 3.434700e-03 | 290 |
| AIC_model | 44 | 0.9343153 | 124.36151 | 0.1081050 | 0 | 2.037380e+04 | 169 |
| AIC_model_PT | 38 | 0.9453143 | 168.90771 | 0.0029938 | 0 | 2.821150e-02 | 151 |
AIC Stepwise Conclusion
Since it doesn't appear that we can back-transform our response variable after using bcnPower, let's predict the response values of the test set using the untransformed AIC_model and see how it does on Kaggle. It received an RMSLE score of 0.18310.
significant_predictors <- intersect(names(AIC_model$model), names(test_df))
AIC_model_fit <- predict(AIC_model, newdata = subset(test_df, select = significant_predictors))
kable(t(round(describe(AIC_model_fit), 2)))
| | X1 |
|---|---|
| vars | 1.00 |
| n | 1459.00 |
| mean | 180061.06 |
| sd | 80115.93 |
| median | 158942.71 |
| trimmed | 169538.85 |
| mad | 57745.59 |
| min | 18303.62 |
| max | 765132.60 |
| range | 746828.98 |
| skew | 1.53 |
| kurtosis | 3.73 |
| se | 2097.45 |
AIC_model_fit_output <- data.frame(Id = test_csv$Id, SalePrice = AIC_model_fit)
write.csv(AIC_model_fit_output, "AIC_model_fit.csv", row.names = FALSE)
https://stats.stackexchange.com/questions/117407/predict-after-using-box-cox-transformation
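For comparison only (a hypothetical sketch, not what was submitted), the selected predictors could instead be fit against log(SalePrice), which, unlike a bcnPower transform, back-transforms cleanly with exp():
# Sketch (hypothetical): a log-response version of the AIC model whose
# predictions can be back-transformed with exp().
train_df_log <- AIC_model$model
train_df_log$SalePrice <- log(train_df_log$SalePrice)
log_model <- lm(SalePrice ~ ., data = train_df_log)
log_fit <- exp(predict(log_model, newdata = subset(test_df, select = significant_predictors)))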
Random Forest
Let's also try a Random Forest regression model on the training data set with the imputed missing values. The model explains about 89% of the variance in the response variable.
y_train <- train_df$SalePrice
x_train_df <- subset(train_df, select = -SalePrice)
(rf_fit <- randomForest(x_train_df, y = y_train, ntree = 500, importance = T))
##
## Call:
## randomForest(x = x_train_df, y = y_train, ntree = 500, importance = T)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 26
##
## Mean of squared residuals: 670082579
## % Var explained: 89.39
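Since the forest was grown with importance = T, a variable-importance plot can also be inspected (a sketch, not part of the original run):
# Sketch: top 15 predictors by importance.
varImpPlot(rf_fit, n.var = 15, main = "Random Forest Variable Importance")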
plot(rf_fit)
rf_model_fit <- predict(rf_fit, test_df)
# Summary of Predicted Output
kable(t(round(describe(rf_model_fit), 2)))
| | X1 |
|---|---|
| vars | 1.00 |
| n | 1459.00 |
| mean | 179738.06 |
| sd | 70094.81 |
| median | 160144.00 |
| trimmed | 169784.84 |
| mad | 48830.13 |
| min | 69309.50 |
| max | 520506.73 |
| range | 451197.22 |
| skew | 1.54 |
| kurtosis | 2.86 |
| se | 1835.09 |
rf_model_fit_output <- data.frame(Id = test_csv$Id, SalePrice = rf_model_fit)
write.csv(rf_model_fit_output, "rf_model_fit.csv", row.names = FALSE)
Conclusions
The Random Forest model received a Root Mean Squared Logarithmic Error (RMSLE) score of 0.15497 (under username KyleGilde) and outperformed the untransformed AIC stepwise model, which received a score of 0.18310.
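For reference, the RMSLE metric as commonly defined (and as computed by MLmetrics::RMSLE) is
\[
\mathrm{RMSLE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\bigl(\log(\hat{y}_i + 1) - \log(y_i + 1)\bigr)^2},
\]
where \(\hat{y}_i\) are the predicted sale prices and \(y_i\) are the actual sale prices; lower scores are better.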