library(dplyr) # %>%
library(tidyr) # replace_na
library(gridExtra) # grid.arrange()
library(ggplot2)
library(corrplot)
library(matrixcalc) # lu.decomposition()
library(MASS) # fitdistr()
library(caret) # train/trainControl, dummyVars
Ames is a city in Story County, Iowa, United States, located approximately 30 miles (48 km) north of Des Moines in central Iowa.
It is a small city with a population of 66,427. The most distinguishing characteristic of Ames is that it is home to Iowa State University. With a student body of 33,391 and a staff of 16,000, it is safe to assume that the culture and lifestyle of the university are present throughout the city.
Read in the Data
file_train<-"C:\\Users\\arono\\source\\R\\Data605\\Final\\Problem3 Houses\\tmp\\train.csv"
file_test<-"C:\\Users\\arono\\source\\R\\Data605\\Final\\Problem3 Houses\\tmp\\test.csv"
ames_train_df <- read.csv(file_train)
ames_test_df <- read.csv(file_test)
Data Prep:
The majority of fields are categorical, so let's isolate the ones that aren't. This split comes from actual inspection of the data: some of the numeric fields are in fact categorical, and some of the character fields are not.
quant_vars<-c("LotFrontage","LotArea","OverallQual","OverallCond","YearBuilt","YearRemodAdd","MasVnrArea","BsmtFinSF1",
"BsmtFinSF2","BsmtUnfSF","TotalBsmtSF","GarageArea","GarageYrBlt", "MoSold","PoolArea","WoodDeckSF","OpenPorchSF",
"EnclosedPorch","X1stFlrSF","X2ndFlrSF","LowQualFinSF","GrLivArea","ScreenPorch","SalePrice")
ames_train_df <- ames_train_df %>% mutate(across(!all_of(quant_vars), as.factor))
quant_vars<-quant_vars [! quant_vars %in% "SalePrice"]
ames_test_df <- ames_test_df %>% mutate(across(!all_of(quant_vars), as.factor))
Which ones are null ?
all_nas<-colSums(sapply(ames_train_df[names(ames_train_df)], is.na))
all_nas[which(all_nas > 0)]
## LotFrontage Alley MasVnrType MasVnrArea BsmtQual BsmtCond
## 259 1369 8 8 37 37
## BsmtExposure BsmtFinType1 BsmtFinType2 Electrical FireplaceQu GarageType
## 38 37 38 1 690 81
## GarageYrBlt GarageFinish GarageQual GarageCond PoolQC Fence
## 81 81 81 81 1453 1179
## MiscFeature
## 1406
The data set is filled with NAs, but note that the Data Description defines NA as Not Applicable, i.e. no basement, no alley, no fireplace, etc.
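Since tidyr was loaded for replace_na, the categorical NAs could also be handled in one pass; a minimal sketch, assuming every missing factor level simply means "None":
ames_train_df <- ames_train_df %>%
  mutate(across(where(is.factor), ~ factor(replace_na(as.character(.x), "None"))))
The field-by-field approach below is kept because the numeric fields need different treatments.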
ames_train_df <- ames_train_df %>% mutate(
LotFrontage = ifelse(is.na(LotFrontage), 0, LotFrontage),
MasVnrArea = ifelse(is.na(MasVnrArea), 0, MasVnrArea),
GarageYrBlt = ifelse(is.na(GarageYrBlt), mean(GarageYrBlt, na.rm = TRUE), GarageYrBlt),
FireplaceQu = ifelse(is.na(FireplaceQu), "None", FireplaceQu),
PoolQC = ifelse(is.na(PoolQC), "None", PoolQC)
)
ames_test_df <- ames_test_df %>% mutate(
LotFrontage = ifelse(is.na(LotFrontage), 0, LotFrontage),
MasVnrArea = ifelse(is.na(MasVnrArea), 0, MasVnrArea),
GarageYrBlt = ifelse(is.na(GarageYrBlt), mean(GarageYrBlt, na.rm = TRUE), GarageYrBlt),
FireplaceQu = ifelse(is.na(FireplaceQu), "None", FireplaceQu),
PoolQC = ifelse(is.na(PoolQC), "None", PoolQC)
)
Let's do some simple visualizations to get to know the data.
First the total distribution of prices…
hist(ames_train_df$SalePrice/1000, main="Ames Home Sales Prices", probability = TRUE, breaks=30, xlab="Sales Price (K)")
So the distribution of prices is skewed right: roughly bell-shaped near $200K but with a long right tail.
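To put a number on that skew, here is a base-R sketch of the sample skewness (the third standardized moment); a positive result confirms the right skew:
x <- ames_train_df$SalePrice
mean((x - mean(x))^3) / sd(x)^3 # > 0 means right-skewed; values above 1 are substantial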
Let's look at a few scatter plots.
On the left we have the relationship between the inner area and the price. On the right we look at the lot area and the price.
library(grid)
plot1<-ggplot(data = ames_train_df, aes(y = SalePrice/1000 , x = (X1stFlrSF + X2ndFlrSF))) + geom_point() +
xlab("House (only) Square Footage") + ylab("Price (K)" )
plot2<-ggplot(data = ames_train_df, aes(y = SalePrice/1000 , x = LotArea)) + geom_point() +
xlab("Lot Square Footage") + ylab("Price (K)" )
grid.arrange(plot1, plot2, ncol = 2, top = textGrob("Square Footage and Price", gp=gpar(fontsize=20,font=3)))
Here we look at 2 important amenities.
plot1<-ggplot(data = ames_train_df, aes(y = SalePrice/1000 , x = PoolArea)) + geom_point() +
xlab("Pool Square Footage") + ylab("Price (K)" )
plot2<-ggplot(data = ames_train_df, aes(y = SalePrice/1000 , x = WoodDeckSF)) + geom_point() +
xlab("Wood Deck Square Footage") + ylab("Price (K)" )
grid.arrange(plot1, plot2, ncol = 2 , top = textGrob("Amenities and Price", gp=gpar(fontsize=20,font=3)))
Looking for collinearity issues… the relationship between Overall Condition and Overall Quality is not as strong as one might think.
plot1<-ggplot(data = ames_train_df, aes(y = SalePrice/1000 , x = OverallCond)) + geom_point() +
xlab("Overall Cond") + ylab("Price (K)" )
plot2<-ggplot(data = ames_train_df, aes(y = SalePrice/1000 , x = OverallQual)) + geom_point() +
xlab("Overall Quality") + ylab("Price (K)" )
grid.arrange(plot1, plot2, ncol = 2, top = textGrob("Condition, Quality and Price", gp=gpar(fontsize=20,font=3)))
cor(ames_train_df$OverallQual,ames_train_df$OverallCond)
## [1] -0.09193234
Let's look at a bar chart of prices by 1) the age of each house and 2) the size of each house.
# note: MASS also exports select(), which masks dplyr's version here, so we need to be explicit with dplyr::select
ames_decade_built<-ames_train_df %>%
mutate(built_decade=as.character(floor(YearBuilt/10)*10)) %>%
dplyr::select("SalePrice", "built_decade") %>%
group_by(built_decade) %>%
summarize(n(), sd=sd(SalePrice), avg_price=mean(SalePrice))
ames_sf<- ames_train_df %>%
mutate(total_sf=(round((TotalBsmtSF + WoodDeckSF + OpenPorchSF + X1stFlrSF + X2ndFlrSF)/1000,0)) ) %>%
dplyr::select("SalePrice", "total_sf") %>%
group_by(total_sf) %>%
summarize(n(), sd=sd(SalePrice), avg_price=mean(SalePrice))
plot1<-ggplot(data = ames_decade_built, aes(y = avg_price/1000, x=built_decade)) +
geom_bar(stat="identity") + xlab("Decade Built") + ylab("Price (K)" ) + theme(axis.text.x = element_text(angle = 60))
plot2<-ggplot(data = ames_sf, aes(y = avg_price/1000, x=total_sf)) +
geom_bar(stat="identity") + xlab("Square Foot (K)") + ylab("Price (K)")
grid.arrange(plot1, plot2, ncol = 2)
We would prefer to use the parametric Pearson's correlation. We can do that if the distributions are normal.
I deliberately chose 3 fields that were not only useful but also roughly normally distributed and homoscedastic (the variance is stable across the distribution).
ames_corr<-ames_train_df %>%
dplyr::select("OverallQual","OverallCond", "X1stFlrSF")
par(mfcol = c(1, 3))
hist(ames_corr$OverallQual, main='Overall Qual')
hist(ames_corr$OverallCond, main='Overall Cond')
hist(ames_corr$X1stFlrSF, main='1st Floor SF')
Run Pearson's correlation.
cor_matrix<-cor(ames_corr, method = c("pearson"))
cor_matrix
## OverallQual OverallCond X1stFlrSF
## OverallQual 1.00000000 -0.09193234 0.4762238
## OverallCond -0.09193234 1.00000000 -0.1442028
## X1stFlrSF 0.47622383 -0.14420278 1.0000000
As previously discussed, there is virtually no relationship between Quality and Condition.
par( col = "blue", xaxp=c(-4,1,4),yaxp=c(0,1,4) )
x<-seq(-4, 4,0.1)
z1<- -4 # this is a constant, no need to change
z2<- -1.96 # this is the standard deviations... -1.96 is 95%
# qnorm(c(0.1, 0.9)) # run this to get a 2 tailed 80% interval (between -1.281552 and 1.281552)
z2<- -1.28
z_title<-paste("Hypothesis Test - Variables are Independent")
plot(x = x, y = dnorm(x,0,1),type = "l", xlab =" 80% Confidence Level", ylab="")
cord.1x<- c(z1,seq(z1,z2,0.01),z2)
cord.1y<- c(0,dnorm(seq(z1,z2,0.01)),0)
polygon(cord.1x,cord.1y,col='grey60') # fills in with grey color
z1<- 1.28
z2<- 4
cord.1x<- c(z1,seq(z1,z2,0.01),z2)
cord.1y<- c(0,dnorm(seq(z1,z2,0.01)),0)
polygon(cord.1x,cord.1y,col='grey60')
title(main=z_title, cex.main = 1.2, line=0.5, cex.lab=2)
Axis(side=1,at=seq(-4, 4, by = 1))
abline(v = 1.28, col="red", lwd=3, lty=2)
abline(v = -1.28, col="red", lwd=3, lty=2)
text(3.0, 0.16, "Positive")
text(3.0, 0.14, "Correlation")
text(-3.2, 0.16, "Negative")
text(-3.2, 0.14, "Correlation")
Our null hypothesis is that the true correlation is 0 (i.e., that cor.test() would center on 0 under repeated sampling). If the p-value < .2 (the 80% confidence level), the result is statistically significant; that is, we reject the claim that the correlation = 0.
ames_corr_subset<-sample_n(ames_corr, 200)
ames_corr_test<-cor.test(ames_corr_subset$OverallQual, ames_corr_subset$X1stFlrSF,method = "pearson", conf.level = .8)
print(sprintf("Sample 1 : Overall Quality vs First Floor SF P-Value= %.6f", ames_corr_test$p.value))
ames_corr_test<-cor.test(ames_corr_subset$OverallQual, ames_corr_subset$OverallCond,method = "pearson", conf.level = .8)
print(sprintf("Sample 1 : Overall Quality vs Overall Condition P-Value= %.6f", ames_corr_test$p.value))
ames_corr_subset<-sample_n(ames_corr, 200)
ames_corr_test<-cor.test(ames_corr_subset$OverallQual, ames_corr_subset$X1stFlrSF,method = "pearson", conf.level = .8)
print(sprintf("Sample 2 : Overall Quality vs First Floor SF P-Value= %.6f", ames_corr_test$p.value))
ames_corr_test<-cor.test(ames_corr_subset$OverallQual, ames_corr_subset$OverallCond,method = "pearson", conf.level = .8)
print(sprintf("Sample 2 : Overall Quality vs Overall Condition P-Value= %.6f", ames_corr_test$p.value))## [1] "Sample 1 : Overall Quality vs First Floor SF P-Value= 0.000000"
## [1] "Sample 1 : Overall Quality vs Overall Condition P-Value= 0.038945"
## [1] "Sample 2 : Overall Quality vs First Floor SF P-Value= 0.000000"
## [1] "Sample 2 : Overall Quality vs Overall Condition P-Value= 0.399526"
The family-wise error rate is the probability of committing at least one Type I error (mistakenly rejecting the null hypothesis) across a set of tests.
When I run the test again and again the p-values vary, so there is a concern. Having said that, I feel the data set has integrity and we can make some reasonable assumptions.
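A standard guard is to adjust the p-values for the number of comparisons; a minimal sketch with base R's p.adjust, applied to the four p-values printed above:
pvals <- c(0.000000, 0.038945, 0.000000, 0.399526)
p.adjust(pvals, method = "bonferroni") # Bonferroni scales each p-value by the number of tests (capped at 1)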
A precision matrix is the inverse of a covariance (or correlation) matrix. It is used in multivariate analysis; in particular, its scaled off-diagonal entries encode partial correlations.
inv_cor_matrix<-solve(cor_matrix)
round(inv_cor_matrix%*%cor_matrix,2)
round(cor_matrix%*%inv_cor_matrix,2)
## OverallQual OverallCond X1stFlrSF
## OverallQual 1 0 0
## OverallCond 0 1 0
## X1stFlrSF 0 0 1
## OverallQual OverallCond X1stFlrSF
## OverallQual 1 0 0
## OverallCond 0 1 0
## X1stFlrSF 0 0 1
Perform LU decomposition.
lu.decomposition(inv_cor_matrix)
## $L
## [,1] [,2] [,3]
## [1,] 1.00000000 0.0000000 0
## [2,] 0.02375348 1.0000000 0
## [3,] -0.47279851 0.1442028 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1.294233 0.03074254 -0.6119115
## [2,] 0.000000 1.02123603 0.1472651
## [3,] 0.000000 0.00000000 1.0000000
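A quick sanity check that the factorization reproduces its input:
lu <- lu.decomposition(inv_cor_matrix)
round(lu$L %*% lu$U - inv_cor_matrix, 10) # should be a matrix of zeros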
A closed-form expression is one that can be evaluated exactly in a finite number of standard operations and functions, with no need for infinite sums or numerical approximation.
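For example, the exponential distribution fitted below has the closed-form CDF
\[F(x) = 1 - e^{-\lambda x}\]
so its probabilities and quantiles can be computed exactly.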
We've already seen that the sales prices are skewed right.
hist(ames_train_df$SalePrice/1000, main="Ames Home Sales Prices", probability = TRUE, breaks=30, xlab="Sales Price (K)")
fitdistr() returns estimated parameters for the requested distribution (e.g., for a normal, the mean and sd).
Run it for the exponential distribution to get the best-fit rate.
exp_fit<-fitdistr(ames_train_df$SalePrice,"exponential")
lambda<-exp_fit$estimate
lambda
## rate
## 5.527268e-06
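For the exponential distribution, the maximum-likelihood rate is simply the reciprocal of the sample mean, so fitdistr is easy to sanity-check:
1/mean(ames_train_df$SalePrice) # should match lambda above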
Take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of the original variable.
SalePrice_exp<-rexp(1000,lambda)
par(mfcol = c(1, 2))
hist(ames_train_df$SalePrice/1000, xlab="Sales Price (K)", main="Original")
hist(SalePrice_exp/1000, xlab="Sales Price (K)", main="Exp Fit")
Find the 5th and 95th percentiles using the cumulative distribution function (CDF)
pct_95<-qexp(.95, rate = lambda) # what's the 95th percentile? ~541723.7
pct_05<-qexp(.05, rate = lambda) # ~9275.46
sprintf("The 5th and 95th percentiles in the continuous distribution are $%.2f and $%.2f", pct_05, pct_95)## [1] "The 5th and 95th percentiles in the continuous distribution are $9280.04 and $541991.47"
Also generate a 95% confidence interval from the empirical data, assuming normality.
lower<-qnorm(p=.025, mean = mean(ames_train_df$SalePrice), sd = sd(ames_train_df$SalePrice))
upper<-qnorm(p=.975, mean = mean(ames_train_df$SalePrice), sd = sd(ames_train_df$SalePrice))
sprintf("The 95%% Confidence Interval would be between $%.2f and $%.2f", lower,upper)## [1] "The 95% Confidence Interval would be between $25216.75 and $336625.64"
Finally, provide the empirical 5th percentile and 95th percentile of the data.
upper<-round(length(ames_train_df$SalePrice) *.95,0)
lower<-round(length(ames_train_df$SalePrice) *.05,0)
upper_price<-sort(ames_train_df$SalePrice)[upper+1]
lower_price<-sort(ames_train_df$SalePrice)[lower+1]
sprintf("95%% of all homes sold less than %.2f", upper_price)
sprintf("5%% of all homes sold for less than %.2f", upper_price)## [1] "95% of all homes sold less than 328000.00"
## [1] "5% of all homes sold for less than 328000.00"
Discuss
A confidence interval and percentiles are 2 different things.
Usually a percentile is applied to empirical data to denote the percentage of all observations that rank below a given result.
\[percentile = \frac{\text{rank of my result} - 1}{\text{total observations}}\]
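For example, a sale ranked 1,388th out of the 1,460 observations sits at \((1388 - 1)/1460 \approx 0.95\), the 95th percentile.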
But to be in the 95th percentile your value would need to be greater than 95% of all values, so I added 1 to the index to get those values.
On a continuous distribution the analogue is the quantile function: the pth percentile is the value x at which the CDF satisfies F(x) = p.
A distinguishing feature of the exponential distribution is that its density starts high and immediately tails off. It is usually used for rate and waiting-time problems, such as "what is the probability I can solve the problem in under 2 minutes?"
Ideally a better-fitting distribution would be available, perhaps one that accepts separate lower and upper standard deviations and degrees of freedom, or simply a skewed family such as the log-normal.
Loop through all quantitative variables, and save off the correlation for review.
# ---------------- run correlation on all quantitative variables against the sales price
quant_cors <- data.frame( # create the dataframe schema
column=character(),
cor=double()
)
for (i in quant_vars){
x<-ames_train_df[[i]]
y<-ames_train_df$SalePrice
cor_res<-cor(x,y)
quant_cors<-rbind(quant_cors, data.frame(column=i, cor=round(cor_res,4)))
}
tail(head(quant_cors[order(quant_cors$cor,decreasing = TRUE),]),n=5)
## column cor
## 22 GrLivArea 0.7086
## 12 GarageArea 0.6234
## 11 TotalBsmtSF 0.6136
## 19 X1stFlrSF 0.6059
## 5 YearBuilt 0.5229
Loop through all non-quantitative variables, and save off the count and the average price by factor level for review.
# Programming notes: I'm looping through all column names not in quant_vars.
# sym() turns the column-name string into an R symbol object,
# and !! is the rlang operator that unquotes (evaluates) the symbol inside the dplyr verb.
# Lastly, pull() converts the single-column result of category levels to a vector, a workaround for rbind errors.
# sym() is useful for treating strings of column names as column objects to perform actions on.
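# Alternative sketch: recent dplyr versions (assumed >= 1.0) offer the .data pronoun,
# which indexes a column by its name string and avoids sym()/!! entirely:
#   ames_train_df %>% group_by(.data[[i]]) %>% summarize(avg_price = mean(SalePrice), count = n())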
factor_cors <- data.frame( # create the dataframe schema
col=character(),
category=character(),
avg_price=double(),
count=integer()
)
for (i in colnames(ames_train_df)[!(colnames(ames_train_df) %in% quant_vars)])
{
i_sym<-sym(i)
res<-ames_train_df %>%
group_by(!!i_sym) %>%
summarize(avg_price=mean(SalePrice),count=n())
category<-pull(res[,1])
df2<-data.frame(col=i, category=category, avg_price=res[,2], count=res[,3])
factor_cors<-rbind(factor_cors,df2)
}
head(factor_cors[order(factor_cors$avg_price,decreasing = TRUE),],n=5)
## col category avg_price count
## 692 Id 692 755000 1
## 2499 SalePrice <NA> 755000 1
## 1183 Id 1183 745000 1
## 2498 SalePrice <NA> 745000 1
## 1170 Id 1170 625000 1
After we've reviewed the data, we can isolate the following key variables for review:
OverallQual: not clear how Quality differs from Condition. Had a .79 correlation with SalePrice.
GrLivArea: above grade (ground) living area. Not clear how it overlaps with the other SF fields, but had a .7 correlation with SalePrice.
X1stFlrSF: first floor SF (some homes have no second floor). Had a .6 correlation with SalePrice.
The remaining candidates (ExterCond, FullBath, FireplaceQu, SaleCondition, GarageCars, OverallCond, CentralAir, PoolQC) showed correlations with SalePrice ranging from very strong to mild.
Correlations between categorical and quantitative variables are very suspect. Having said that, convert the categories to numeric and display a corrplot for review.
candidates=c("GrLivArea", "X1stFlrSF", "OverallQual", "ExterCond", "FullBath", "FireplaceQu", "SaleCondition", "GarageCars", "OverallCond", "CentralAir", "PoolQC")
ames_corr<-ames_train_df %>%
dplyr::select(all_of(candidates)) %>%
mutate(ExterCond=as.numeric(as.factor(ExterCond)),
FullBath=as.numeric(as.factor(FullBath)),
FireplaceQu=as.numeric(as.factor(FireplaceQu)),
SaleCondition=as.numeric(as.factor(SaleCondition)),
GarageCars=as.numeric(as.factor(GarageCars)),
OverallCond=as.numeric(as.factor(OverallCond)),
CentralAir=as.numeric(as.factor(CentralAir)),
PoolQC=as.numeric(as.factor(PoolQC))
)
ames_correlations<-cor(ames_corr, method = c("pearson"))
corrplot(ames_correlations, method="color")
Model Function
Note: this code is a template for testing models. It compares predictions against the training actuals. I played with it a bunch.
run_model<-function(mod) {
train.control <- trainControl(method = "cv", number = 10, verboseIter = FALSE)
model <- train(mod, data = ames_train_df, method = "lm", trControl = train.control)
pred <- predict(model, ames_train_df)
actual <- ames_train_df$SalePrice
rmse<-sqrt(mean((actual - pred)^2)) # 43069.61
corr<-cor(actual,pred) # 0.840163
return(c(corr,rmse))
}
Run Models on all Combinations. Pick the best.
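Before looping over every combination, a quick single-formula sanity check of run_model (the predictor choice here is just illustrative):
run_model(SalePrice ~ GrLivArea + OverallQual) # returns c(correlation, RMSE)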
# Programming notes: much like I used sym() earlier to translate a string into an object, here I'm using eval/parse to do the same for a model formula
all_models <- data.frame(
model=character(),
cor=double(),
rmse=double()
)
# note: combos of 9 thru 11 (of 11) is 55 + 11 + 1 = 67 total models
for (i in 9:length(candidates)) {
  combos<-combn(candidates, i)
  for (k in 1:ncol(combos)) { # separate index so the outer i isn't shadowed
    i_str<-"SalePrice~"
    for (j in 1:nrow(combos))
    {
      if (j>1) {
        i_str<-paste(i_str, " + ", combos[j,k])
      } else {
        i_str<-paste(i_str, combos[j,k])
      }
    }
    res<-run_model(eval(parse(text=i_str)))
    all_models<-rbind(all_models, data.frame(model=i_str, cor=round(res[1],4), rmse=round(res[2],0)))
  }
}
# 3 best
head(all_models[order(all_models$cor,decreasing = TRUE),],n=3)
write.csv(all_models,"C:\\Users\\arono\\source\\R\\Data605\\Final\\Problem3 Houses\\tmp\\all_models2.csv", row.names = TRUE)
## model
## 67 SalePrice~ GrLivArea + X1stFlrSF + OverallQual + ExterCond + FullBath + FireplaceQu + SaleCondition + GarageCars + OverallCond + CentralAir + PoolQC
## 63 SalePrice~ GrLivArea + X1stFlrSF + OverallQual + FullBath + FireplaceQu + SaleCondition + GarageCars + OverallCond + CentralAir + PoolQC
## 58 SalePrice~ GrLivArea + X1stFlrSF + OverallQual + ExterCond + FullBath + FireplaceQu + SaleCondition + GarageCars + CentralAir + PoolQC
## cor rmse
## 67 0.8958 35289
## 63 0.8957 35315
## 58 0.8949 35439
Note: My original model failed on the test dataset. I leveraged this Kaggle submission and wound up coding something a little more advanced.
Start from scratch. Reread the files…
ames_train_df <- read.csv(file_train)
ames_test_df <- read.csv(file_test)
# the NAs are mostly "Not Applicable", i.e. not actually missing
ames_train_df[is.na(ames_train_df)] <- "none"
ames_test_df[is.na(ames_test_df)] <- "none"
# save the test Id; we'll need it to submit the predictions
ames_test_Id <- ames_test_df$Id
# save the SalePrice; we'll be removing it to build the model
SalePrice <- ames_train_df$SalePrice
# separate the quants from the cats
quant_vars<-c("LotFrontage","LotArea","OverallQual","OverallCond","YearBuilt","YearRemodAdd","MasVnrArea","BsmtFinSF1",
"BsmtFinSF2","BsmtUnfSF","TotalBsmtSF","GarageArea","GarageYrBlt", "MoSold","PoolArea","WoodDeckSF","OpenPorchSF",
"EnclosedPorch","X1stFlrSF","X2ndFlrSF","LowQualFinSF","GrLivArea","ScreenPorch","Id")
ames_train_cat<-ames_train_df[, !names(ames_train_df) %in% quant_vars]
ames_train_quant<-ames_train_df[, names(ames_train_df) %in% quant_vars]
ames_test_cat<-ames_test_df[, !names(ames_test_df) %in% quant_vars]
ames_test_quant<-ames_test_df[, names(ames_test_df) %in% quant_vars]
# -----------------------------------------------------------------------------------------------------
# Post-analysis, these are the fields for our model; we could probably do more, but these look pretty good
explanatory=c("GrLivArea", "X1stFlrSF", "OverallQual", "ExterCond", "FullBath", "FireplaceQu", "SaleCondition", "GarageCars", "OverallCond", "CentralAir", "PoolQC")
need<-c(explanatory,'SalePrice')
# remove columns we don't need
# ames_train_df <-ames_train_df[, names(ames_train_df) %in% need]
# ames_test_df <-ames_test_df[, names(ames_test_df) %in% need]
ames_train_cat<-ames_train_cat[, names(ames_train_cat) %in% need]
ames_train_quant<-ames_train_quant[, names(ames_train_quant) %in% need]
ames_test_cat<-ames_test_cat[, names(ames_test_cat) %in% need]
ames_test_quant<-ames_test_quant[, names(ames_test_quant) %in% need]
# ---------------------- cats
# dummyVars is a caret function that expands factors into one-hot 0/1 indicator columns
ames_train_cat_imputed <- dummyVars(~., ames_train_cat) %>%
predict(ames_train_cat) %>%
data.frame
ames_test_cat_imputed <- dummyVars(~., ames_test_cat) %>%
predict(ames_test_cat) %>%
data.frame
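# For intuition, a tiny hypothetical example of what dummyVars produces:
#   toy <- data.frame(qual = factor(c("Ex", "Fa", "Ex")))
#   predict(dummyVars(~ ., toy), toy) # two 0/1 indicator columns: qual.Ex and qual.Fa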
ames_train_cat_imputed_prep <- preProcess(ames_train_cat_imputed,method = c("zv","nzv","medianImpute"),cutoff = 0.8 ,freqCut = 2, uniqueCut = 20)
# in the original Kaggle kernel this is: test_cat_imputed <- predict(train_cat_prep, test_dummy)
# note: the prep object is fit on train only; apply it to train to get train, and to test to get test
train_cat_pred <- predict(ames_train_cat_imputed_prep,ames_train_cat_imputed)
test_cat_pred <- predict(ames_train_cat_imputed_prep,ames_test_cat_imputed)
class(ames_train_cat_imputed_prep) # preProcess
class(ames_test_cat_imputed) # data.frame
# -------------- quants
# the quantitative columns don't need dummyVars
ames_train_quant_prep <- preProcess(ames_train_quant,method = c("zv","corr","knnImpute"),
cutoff = 0.8 ,freqCut = 2, uniqueCut = 20)
# the original kernel calls this num_imputed
train_num_pred <- predict(ames_train_quant_prep,ames_train_quant)
test_num_pred <- predict(ames_train_quant_prep, ames_test_quant)
# combine
train_imputed <- cbind(train_cat_pred, train_num_pred,SalePrice = SalePrice)
test_imputed <- cbind(test_cat_pred, test_num_pred)
cvFolds = createFolds(SalePrice, k = 5)
boot_trControl <- trainControl(predictionBounds = c(min(SalePrice),max(SalePrice)),
method ="boot",
number = 10,
selectionFunction ="best",
index = cvFolds,
search = "random",
verboseIter = FALSE,
savePredictions = TRUE)
gbm_model <- train(SalePrice ~. , train_imputed,
method = "gbm",
trControl = boot_trControl,
tuneLength = 10,
verbose = FALSE
) %>%
suppressWarnings
test_imputed<-cbind(ames_test_Id,test_imputed)
pred_take_two <- data.frame(Id = ames_test_Id, SalePrice = predict(gbm_model,test_imputed))
write.csv(pred_take_two,"C:\\Users\\arono\\source\\R\\Data605\\Final\\Problem3 Houses\\tmp\\predictions_take_two.csv", row.names = TRUE)
## [1] "preProcess"
## [1] "data.frame"