library(dplyr) # %>%
library(tidyr) # replace_na
library(gridExtra) # grid.arrange()
library(ggplot2)
library(corrplot)
library(matrixcalc) # lu.decomposition()
library(MASS) # fitdistr()
library(caret) # train/trainControl, dummyVars
Ames is a city in Story County, Iowa, United States, located approximately 30 miles (48 km) north of Des Moines in central Iowa.
It is a small city with a population of 66,427. The most distinguishing characteristic of Ames is that it is home to Iowa State University. With a student body of 33,391 and a staff of 16,000, it is safe to assume that the culture and lifestyle of the university are present throughout the city.
Read in the Data
file_train<-"C:\\Users\\arono\\source\\R\\Data605\\Final\\Problem3 Houses\\tmp\\train.csv"
file_test<-"C:\\Users\\arono\\source\\R\\Data605\\Final\\Problem3 Houses\\tmp\\test.csv"
ames_train_df <- read.csv(file_train)
ames_test_df <- read.csv(file_test)
Data Prep:
The majority of fields are categorical, so let's isolate the ones that aren't. This split comes from actual inspection of the data: some of the numeric fields are in fact categorical, and some of the character fields are not.
quant_vars<-c("LotFrontage","LotArea","OverallQual","OverallCond","YearBuilt","YearRemodAdd","MasVnrArea","BsmtFinSF1",
"BsmtFinSF2","BsmtUnfSF","TotalBsmtSF","GarageArea","GarageYrBlt", "MoSold","PoolArea","WoodDeckSF","OpenPorchSF",
"EnclosedPorch","X1stFlrSF","X2ndFlrSF","LowQualFinSF","GrLivArea","ScreenPorch","SalePrice")
ames_train_df <- ames_train_df %>% mutate(across(!all_of(quant_vars), as.factor))
quant_vars<-quant_vars [! quant_vars %in% "SalePrice"]
ames_test_df <- ames_test_df %>% mutate(across(!all_of(quant_vars), as.factor))
Which ones are null ?
all_nas<-colSums(sapply(ames_train_df[names(ames_train_df)], is.na))
all_nas[which(all_nas > 0)]
## LotFrontage Alley MasVnrType MasVnrArea BsmtQual BsmtCond
## 259 1369 8 8 37 37
## BsmtExposure BsmtFinType1 BsmtFinType2 Electrical FireplaceQu GarageType
## 38 37 38 1 690 81
## GarageYrBlt GarageFinish GarageQual GarageCond PoolQC Fence
## 81 81 81 81 1453 1179
## MiscFeature
## 1406
The data set is filled with NAs, but note that the Data Description defines NA as Not Applicable, i.e. no basement, no alley, no fireplace, etc.
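Since tidyr was loaded for replace_na, the categorical NAs could also be handled in one pass; a minimal sketch, assuming every missing factor level simply means "None":
ames_train_df <- ames_train_df %>%
  mutate(across(where(is.factor), ~ factor(replace_na(as.character(.x), "None"))))
The field-by-field approach below is kept because the numeric fields need different treatments.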
ames_train_df <- ames_train_df %>% mutate(
LotFrontage = ifelse(is.na(LotFrontage), 0, LotFrontage),
MasVnrArea = ifelse(is.na(MasVnrArea), 0, MasVnrArea),
GarageYrBlt = ifelse(is.na(GarageYrBlt), mean(GarageYrBlt, na.rm = TRUE), GarageYrBlt),
FireplaceQu = ifelse(is.na(FireplaceQu), "None", FireplaceQu),
PoolQC = ifelse(is.na(PoolQC), "None", PoolQC)
)
ames_test_df <- ames_test_df %>% mutate(
LotFrontage = ifelse(is.na(LotFrontage), 0, LotFrontage),
MasVnrArea = ifelse(is.na(MasVnrArea), 0, MasVnrArea),
GarageYrBlt = ifelse(is.na(GarageYrBlt), mean(GarageYrBlt, na.rm = TRUE), GarageYrBlt),
FireplaceQu = ifelse(is.na(FireplaceQu), "None", FireplaceQu),
PoolQC = ifelse(is.na(PoolQC), "None", PoolQC)
)
Let's do some simple visualizations to get to know the data.
First the total distribution of prices…
hist(ames_train_df$SalePrice/1000, main="Ames Home Sales Prices", probability = TRUE, breaks=30, xlab="Sales Price (K)")
So the distribution of prices is skewed right: roughly bell-shaped near $200K but with a long right tail.
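To put a number on that skew, here is a base-R sketch of the sample skewness (the third standardized moment); a positive result confirms the right skew:
x <- ames_train_df$SalePrice
mean((x - mean(x))^3) / sd(x)^3 # > 0 means right-skewed; values above 1 are substantial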
Let's look at a few scatter plots.
On the left we have the relationship between the inner area and the price. On the right we look at the lot area and the price.
library(grid)
plot1<-ggplot(data = ames_train_df, aes(y = SalePrice/1000 , x = (X1stFlrSF + X2ndFlrSF))) + geom_point() +
xlab("House (only) Square Footage") + ylab("Price (K)" )
plot2<-ggplot(data = ames_train_df, aes(y = SalePrice/1000 , x = LotArea)) + geom_point() +
xlab("Lot Square Footage") + ylab("Price (K)" )
grid.arrange(plot1, plot2, ncol = 2, top = textGrob("Square Footage and Price", gp=gpar(fontsize=20,font=3)))
Here we look at 2 important amenities.
plot1<-ggplot(data = ames_train_df, aes(y = SalePrice/1000 , x = PoolArea)) + geom_point() +
xlab("Pool Square Footage") + ylab("Price (K)" )
plot2<-ggplot(data = ames_train_df, aes(y = SalePrice/1000 , x = WoodDeckSF)) + geom_point() +
xlab("Wood Deck Square Footage") + ylab("Price (K)" )
grid.arrange(plot1, plot2, ncol = 2 , top = textGrob("Amenities and Price", gp=gpar(fontsize=20,font=3)))
Looking for collinearity issues… the relationship between Overall Condition and Overall Quality is not as strong as one might think.
plot1<-ggplot(data = ames_train_df, aes(y = SalePrice/1000 , x = OverallCond)) + geom_point() +
xlab("Overall Cond") + ylab("Price (K)" )
plot2<-ggplot(data = ames_train_df, aes(y = SalePrice/1000 , x = OverallQual)) + geom_point() +
xlab("Overall Quality") + ylab("Price (K)" )
grid.arrange(plot1, plot2, ncol = 2, top = textGrob("Condition, Quality and Price", gp=gpar(fontsize=20,font=3)))
cor(ames_train_df$OverallQual,ames_train_df$OverallCond)
## [1] -0.09193234
Let's look at a bar chart of prices by 1) the age of each house and 2) the size of each house.
# note: MASS also exports select(), which masks dplyr's version here, so we need to be explicit with dplyr::select
ames_decade_built<-ames_train_df %>%
mutate(built_decade=as.character(floor(YearBuilt/10)*10)) %>%
dplyr::select("SalePrice", "built_decade") %>%
group_by(built_decade) %>%
summarize(n(), sd=sd(SalePrice), avg_price=mean(SalePrice))
ames_sf<- ames_train_df %>%
mutate(total_sf=(round((TotalBsmtSF + WoodDeckSF + OpenPorchSF + X1stFlrSF + X2ndFlrSF)/1000,0)) ) %>%
dplyr::select("SalePrice", "total_sf") %>%
group_by(total_sf) %>%
summarize(n(), sd=sd(SalePrice), avg_price=mean(SalePrice))
plot1<-ggplot(data = ames_decade_built, aes(y = avg_price/1000, x=built_decade)) +
geom_bar(stat="identity") + xlab("Decade Built") + ylab("Price (K)" ) + theme(axis.text.x = element_text(angle = 60))
plot2<-ggplot(data = ames_sf, aes(y = avg_price/1000, x=total_sf)) +
geom_bar(stat="identity") + xlab("Square Foot (K)") + ylab("Price (K)")
grid.arrange(plot1, plot2, ncol = 2)
We would prefer to use the parametric Pearson's correlation. We can do that if the distributions are normal.
I deliberately chose 3 fields that were not only useful but also roughly normally distributed and homoscedastic (the variance is stable across the distribution).
ames_corr<-ames_train_df %>%
dplyr::select("OverallQual","OverallCond", "X1stFlrSF")
par(mfcol = c(1, 3))
hist(ames_corr$OverallQual, main='Overall Qual')
hist(ames_corr$OverallCond, main='Overall Cond')
hist(ames_corr$X1stFlrSF, main='1st Floor SF')
Run Pearson's correlation.
cor_matrix<-cor(ames_corr, method = c("pearson"))
cor_matrix
## OverallQual OverallCond X1stFlrSF
## OverallQual 1.00000000 -0.09193234 0.4762238
## OverallCond -0.09193234 1.00000000 -0.1442028
## X1stFlrSF 0.47622383 -0.14420278 1.0000000
As previously discussed, there is virtually no relationship between Quality and Condition.
par( col = "blue", xaxp=c(-4,1,4),yaxp=c(0,1,4) )
x<-seq(-4, 4,0.1)
z1<- -4 # this is a constant, no need to change
z2<- -1.96 # this is the standard deviations... -1.96 is 95%
# qnorm(c(0.1, 0.9)) # run this to get a 2 tailed 80% interval (between -1.281552 and 1.281552)
z2<- -1.28
z_title<-paste("Hypothesis Test - Variables are Independent")
plot(x = x, y = dnorm(x,0,1),type = "l", xlab =" 80% Confidence Level", ylab="")
cord.1x<- c(z1,seq(z1,z2,0.01),z2)
cord.1y<- c(0,dnorm(seq(z1,z2,0.01)),0)
polygon(cord.1x,cord.1y,col='grey60') # fills in with grey color
z1<- 1.28
z2<- 4
cord.1x<- c(z1,seq(z1,z2,0.01),z2)
cord.1y<- c(0,dnorm(seq(z1,z2,0.01)),0)
polygon(cord.1x,cord.1y,col='grey60')
title(main=z_title, cex.main = 1.2, line=0.5, cex.lab=2)
Axis(side=1,at=seq(-4, 4, by = 1))
abline(v = 1.28, col="red", lwd=3, lty=2)
abline(v = -1.28, col="red", lwd=3, lty=2)
text(3.0, 0.16, "Positive")
text(3.0, 0.14, "Correlation")
text(-3.2, 0.16, "Negative")
text(-3.2, 0.14, "Correlation")
Our null hypothesis is that the true correlation is 0 (i.e., that cor.test() would center on 0 under repeated sampling). If the p-value < .2 (the 80% confidence level), the result is statistically significant; that is, we reject the claim that the correlation = 0.
ames_corr_subset<-sample_n(ames_corr, 200)
ames_corr_test<-cor.test(ames_corr_subset$OverallQual, ames_corr_subset$X1stFlrSF,method = "pearson", conf.level = .8)
print(sprintf("Sample 1 : Overall Quality vs First Floor SF P-Value= %.6f", ames_corr_test$p.value))
ames_corr_test<-cor.test(ames_corr_subset$OverallQual, ames_corr_subset$OverallCond,method = "pearson", conf.level = .8)
print(sprintf("Sample 1 : Overall Quality vs Overall Condition P-Value= %.6f", ames_corr_test$p.value))
ames_corr_subset<-sample_n(ames_corr, 200)
ames_corr_test<-cor.test(ames_corr_subset$OverallQual, ames_corr_subset$X1stFlrSF,method = "pearson", conf.level = .8)
print(sprintf("Sample 2 : Overall Quality vs First Floor SF P-Value= %.6f", ames_corr_test$p.value))
ames_corr_test<-cor.test(ames_corr_subset$OverallQual, ames_corr_subset$OverallCond,method = "pearson", conf.level = .8)
print(sprintf("Sample 2 : Overall Quality vs Overall Condition P-Value= %.6f", ames_corr_test$p.value))## [1] "Sample 1 : Overall Quality vs First Floor SF P-Value= 0.000000"
## [1] "Sample 1 : Overall Quality vs Overall Condition P-Value= 0.038945"
## [1] "Sample 2 : Overall Quality vs First Floor SF P-Value= 0.000000"
## [1] "Sample 2 : Overall Quality vs Overall Condition P-Value= 0.399526"
The family-wise error rate is the probability of committing at least one Type I error (mistakenly rejecting the null hypothesis) across a set of tests.
When I run the test again and again the p-values vary, so there is a concern. Having said that, I feel the data set has integrity and we can make some reasonable assumptions.
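A standard guard is to adjust the p-values for the number of comparisons; a minimal sketch with base R's p.adjust, applied to the four p-values printed above:
pvals <- c(0.000000, 0.038945, 0.000000, 0.399526)
p.adjust(pvals, method = "bonferroni") # Bonferroni scales each p-value by the number of tests (capped at 1)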
A precision matrix is the inverse of a covariance (or correlation) matrix. It is used in multivariate analysis; in particular, its scaled off-diagonal entries encode partial correlations.
inv_cor_matrix<-solve(cor_matrix)
round(inv_cor_matrix%*%cor_matrix,2)
round(cor_matrix%*%inv_cor_matrix,2)
## OverallQual OverallCond X1stFlrSF
## OverallQual 1 0 0
## OverallCond 0 1 0
## X1stFlrSF 0 0 1
## OverallQual OverallCond X1stFlrSF
## OverallQual 1 0 0
## OverallCond 0 1 0
## X1stFlrSF 0 0 1
Perform LU decomposition.
lu.decomposition(inv_cor_matrix)
## $L
## [,1] [,2] [,3]
## [1,] 1.00000000 0.0000000 0
## [2,] 0.02375348 1.0000000 0
## [3,] -0.47279851 0.1442028 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1.294233 0.03074254 -0.6119115
## [2,] 0.000000 1.02123603 0.1472651
## [3,] 0.000000 0.00000000 1.0000000
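A quick sanity check that the factorization reproduces its input:
lu <- lu.decomposition(inv_cor_matrix)
round(lu$L %*% lu$U - inv_cor_matrix, 10) # should be a matrix of zeros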
A closed-form expression is one that can be evaluated exactly in a finite number of standard operations and functions, with no need for infinite sums or numerical approximation.
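For example, the exponential distribution fitted below has the closed-form CDF
\[F(x) = 1 - e^{-\lambda x}\]
so its probabilities and quantiles can be computed exactly.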
We've already seen that the sales prices are skewed right.
hist(ames_train_df$SalePrice/1000, main="Ames Home Sales Prices", probability = TRUE, breaks=30, xlab="Sales Price (K)")
fitdistr() returns estimated parameters for the requested distribution (e.g., for a normal, the mean and sd).
Run it for the exponential distribution to get the best-fit rate.
exp_fit<-fitdistr(ames_train_df$SalePrice,"exponential")
lambda<-exp_fit$estimate
lambda
## rate
## 5.527268e-06
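For the exponential distribution, the maximum-likelihood rate is simply the reciprocal of the sample mean, so fitdistr is easy to sanity-check:
1/mean(ames_train_df$SalePrice) # should match lambda above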
Take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of the original variable.
SalePrice_exp<-rexp(1000,lambda)
par(mfcol = c(1, 2))
hist(ames_train_df$SalePrice/1000, xlab="Sales Price (K)", main="Original")
hist(SalePrice_exp/1000, xlab="Sales Price (K)", main="Exp Fit")
Find the 5th and 95th percentiles using the cumulative distribution function (CDF)
pct_95<-qexp(.95, rate = lambda) # what's the 95th percentile? ~541723.7
pct_05<-qexp(.05, rate = lambda) # ~9275.46
sprintf("The 5th and 95th percentiles in the continuous distribution are $%.2f and $%.2f", pct_05, pct_95)## [1] "The 5th and 95th percentiles in the continuous distribution are $9280.04 and $541991.47"
Also generate a 95% confidence interval from the empirical data, assuming normality.
lower<-qnorm(p=.025, mean = mean(ames_train_df$SalePrice), sd = sd(ames_train_df$SalePrice))
upper<-qnorm(p=.975, mean = mean(ames_train_df$SalePrice), sd = sd(ames_train_df$SalePrice))
sprintf("The 95%% Confidence Interval would be between $%.2f and $%.2f", lower,upper)## [1] "The 95% Confidence Interval would be between $25216.75 and $336625.64"
Finally, provide the empirical 5th percentile and 95th percentile of the data.
upper<-round(length(ames_train_df$SalePrice) *.95,0)
lower<-round(length(ames_train_df$SalePrice) *.05,0)
upper_price<-sort(ames_train_df$SalePrice)[upper+1]
lower_price<-sort(ames_train_df$SalePrice)[lower+1]
sprintf("95%% of all homes sold less than %.2f", upper_price)
sprintf("5%% of all homes sold for less than %.2f", upper_price)## [1] "95% of all homes sold less than 328000.00"
## [1] "5% of all homes sold for less than 328000.00"
Discuss
A confidence interval and percentiles are 2 different things.
Usually a percentile is applied to empirical data to denote the percentage of all observations that rank below a given result.
\[percentile = \frac{\text{rank of my result} - 1}{\text{total observations}}\]
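For example, a sale ranked 1,388th out of the 1,460 observations sits at \((1388 - 1)/1460 \approx 0.95\), the 95th percentile.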
But to be in the 95th percentile your value would need to be greater than 95% of all values, so I added 1 to the index to get those values.
On a continuous distribution the analogue is the quantile function: the pth percentile is the value x at which the CDF satisfies F(x) = p.
A distinguishing feature of the exponential distribution is that its density starts high and immediately tails off. It is usually used for rate and waiting-time problems, such as "what is the probability I can solve the problem in under 2 minutes?"
Ideally a better-fitting distribution would be available, perhaps one that accepts separate lower and upper standard deviations and degrees of freedom, or simply a skewed family such as the log-normal.
Loop through all quantitative variables, and save off the correlation for review.
# ---------------- run correlation on all quantitative variables against the sales price
quant_cors <- data.frame( # create the dataframe schema
column=character(),
cor=double()
)
for (i in quant_vars){
x<-ames_train_df[[i]]
y<-ames_train_df$SalePrice
cor_res<-cor(x,y)
quant_cors<-rbind(quant_cors, data.frame(column=i, cor=round(cor_res,4)))
}
tail(head(quant_cors[order(quant_cors$cor,decreasing = TRUE),]),n=5)
## column cor
## 22 GrLivArea 0.7086
## 12 GarageArea 0.6234
## 11 TotalBsmtSF 0.6136
## 19 X1stFlrSF 0.6059
## 5 YearBuilt 0.5229
Loop through all non-quantitative variables, and save off the count and the average price by factor level for review.
# Programming notes: I'm looping through all column names not in quant_vars.
# sym() turns the column-name string into an R symbol object,
# and !! is the rlang operator that unquotes (evaluates) the symbol inside the dplyr verb.
# Lastly, pull() converts the single-column result of category levels to a vector, a workaround for rbind errors.
# sym() is useful for treating strings of column names as column objects to perform actions on.
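# Alternative sketch: recent dplyr versions (assumed >= 1.0) offer the .data pronoun,
# which indexes a column by its name string and avoids sym()/!! entirely:
#   ames_train_df %>% group_by(.data[[i]]) %>% summarize(avg_price = mean(SalePrice), count = n())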
factor_cors <- data.frame( # create the dataframe schema
col=character(),
category=character(),
avg_price=double(),
count=integer()
)
for (i in colnames(ames_train_df)[!(colnames(ames_train_df) %in% quant_vars)])
{
i_sym<-sym(i)
res<-ames_train_df %>%
group_by(!!i_sym) %>%
summarize(avg_price=mean(SalePrice),count=n())
category<-pull(res[,1])
df2<-data.frame(col=i, category=category, avg_price=res[,2], count=res[,3])
factor_cors<-rbind(factor_cors,df2)
}
head(factor_cors[order(factor_cors$avg_price,decreasing = TRUE),],n=5)
## col category avg_price count
## 692 Id 692 755000 1
## 2499 SalePrice <NA> 755000 1
## 1183 Id 1183 745000 1
## 2498 SalePrice <NA> 745000 1
## 1170 Id 1170 625000 1
After we've reviewed the data, we can isolate the following key variables for review:
OverallQual: not clear how Quality differs from Condition. Had a .79 correlation with SalePrice.
GrLivArea: above grade (ground) living area. Not clear how it overlaps with the other SF fields, but had a .7 correlation with SalePrice.
X1stFlrSF: first floor SF (some homes have no second floor). Had a .6 correlation with SalePrice.
The remaining candidates (ExterCond, FullBath, FireplaceQu, SaleCondition, GarageCars, OverallCond, CentralAir, PoolQC) showed correlations with SalePrice ranging from very strong to mild.
Correlations between categorical and quantitative variables are very suspect. Having said that, convert the categories to numeric and display a corrplot for review.
candidates=c("GrLivArea", "X1stFlrSF", "OverallQual", "ExterCond", "FullBath", "FireplaceQu", "SaleCondition", "GarageCars", "OverallCond", "CentralAir", "PoolQC")
ames_corr<-ames_train_df %>%
dplyr::select(all_of(candidates)) %>%
mutate(ExterCond=as.numeric(as.factor(ExterCond)),
FullBath=as.numeric(as.factor(FullBath)),
FireplaceQu=as.numeric(as.factor(FireplaceQu)),
SaleCondition=as.numeric(as.factor(SaleCondition)),
GarageCars=as.numeric(as.factor(GarageCars)),
OverallCond=as.numeric(as.factor(OverallCond)),
CentralAir=as.numeric(as.factor(CentralAir)),
PoolQC=as.numeric(as.factor(PoolQC))
)
ames_correlations<-cor(ames_corr, method = c("pearson"))
corrplot(ames_correlations, method="color")
Model Function
Note: this code is a template for testing models. It compares predictions against the training actuals. I played with it a bunch.
run_model<-function(mod) {
train.control <- trainControl(method = "cv", number = 10, verboseIter = FALSE)
model <- train(mod, data = ames_train_df, method = "lm", trControl = train.control)
pred <- predict(model, ames_train_df)
actual <- ames_train_df$SalePrice
rmse<-sqrt(mean((actual - pred)^2)) # 43069.61
corr<-cor(actual,pred) # 0.840163
return(c(corr,rmse))
}
Run Models on all Combinations. Pick the best.
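Before looping over every combination, a quick single-formula sanity check of run_model (the predictor choice here is just illustrative):
run_model(SalePrice ~ GrLivArea + OverallQual) # returns c(correlation, RMSE)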
# Programming notes: much like I used sym() earlier to translate a string into an object, here I'm using eval/parse to do the same for a model formula
all_models <- data.frame(
model=character(),
cor=double(),
rmse=double()
)
# note: combos of 9 thru 11 (of 11) is 55 + 11 + 1 = 67 total models
for (i in 9:length(candidates)) {
  combos<-combn(candidates, i)
  for (k in 1:ncol(combos)) { # separate index so the outer i isn't shadowed
    i_str<-"SalePrice~"
    for (j in 1:nrow(combos))
    {
      if (j>1) {
        i_str<-paste(i_str, " + ", combos[j,k])
      } else {
        i_str<-paste(i_str, combos[j,k])
      }
    }
    res<-run_model(eval(parse(text=i_str)))
    all_models<-rbind(all_models, data.frame(model=i_str, cor=round(res[1],4), rmse=round(res[2],0)))
  }
}
# 3 best
head(all_models[order(all_models$cor,decreasing = TRUE),],n=3)
write.csv(all_models,"C:\\Users\\arono\\source\\R\\Data605\\Final\\Problem3 Houses\\tmp\\all_models2.csv", row.names = TRUE)
## model
## 67 SalePrice~ GrLivArea + X1stFlrSF + OverallQual + ExterCond + FullBath + FireplaceQu + SaleCondition + GarageCars + OverallCond + CentralAir + PoolQC
## 63 SalePrice~ GrLivArea + X1stFlrSF + OverallQual + FullBath + FireplaceQu + SaleCondition + GarageCars + OverallCond + CentralAir + PoolQC
## 58 SalePrice~ GrLivArea + X1stFlrSF + OverallQual + ExterCond + FullBath + FireplaceQu + SaleCondition + GarageCars + CentralAir + PoolQC
## cor rmse
## 67 0.8958 35289
## 63 0.8957 35315
## 58 0.8949 35439
Note: My original model failed on the test dataset. I leveraged this Kaggle submission and wound up coding something a little more advanced.
Start from scratch. Reread the files…
ames_train_df <- read.csv(file_train)
ames_test_df <- read.csv(file_test)
# the NAs are mostly "Not Applicable", i.e. not actually missing
ames_train_df[is.na(ames_train_df)] <- "none"
ames_test_df[is.na(ames_test_df)] <- "none"
# save the test Id; we'll need it to submit the predictions
ames_test_Id <- ames_test_df$Id
# save the SalePrice; we'll be removing it to build the model
SalePrice <- ames_train_df$SalePrice
# separate the quants from the cats
quant_vars<-c("LotFrontage","LotArea","OverallQual","OverallCond","YearBuilt","YearRemodAdd","MasVnrArea","BsmtFinSF1",
"BsmtFinSF2","BsmtUnfSF","TotalBsmtSF","GarageArea","GarageYrBlt", "MoSold","PoolArea","WoodDeckSF","OpenPorchSF",
"EnclosedPorch","X1stFlrSF","X2ndFlrSF","LowQualFinSF","GrLivArea","ScreenPorch","Id")
ames_train_cat<-ames_train_df[, !names(ames_train_df) %in% quant_vars]
ames_train_quant<-ames_train_df[, names(ames_train_df) %in% quant_vars]
ames_test_cat<-ames_test_df[, !names(ames_test_df) %in% quant_vars]
ames_test_quant<-ames_test_df[, names(ames_test_df) %in% quant_vars]
# -----------------------------------------------------------------------------------------------------
# Post-analysis, these are the fields for our model; we could probably do more, but these look pretty good
explanatory=c("GrLivArea", "X1stFlrSF", "OverallQual", "ExterCond", "FullBath", "FireplaceQu", "SaleCondition", "GarageCars", "OverallCond", "CentralAir", "PoolQC")
need<-c(explanatory,'SalePrice')
# remove columns we don't need
# ames_train_df <-ames_train_df[, names(ames_train_df) %in% need]
# ames_test_df <-ames_test_df[, names(ames_test_df) %in% need]
ames_train_cat<-ames_train_cat[, names(ames_train_cat) %in% need]
ames_train_quant<-ames_train_quant[, names(ames_train_quant) %in% need]
ames_test_cat<-ames_test_cat[, names(ames_test_cat) %in% need]
ames_test_quant<-ames_test_quant[, names(ames_test_quant) %in% need]
# ---------------------- cats
# dummyVars is a caret function that expands factors into one-hot 0/1 indicator columns
ames_train_cat_imputed <- dummyVars(~., ames_train_cat) %>%
predict(ames_train_cat) %>%
data.frame
ames_test_cat_imputed <- dummyVars(~., ames_test_cat) %>%
predict(ames_test_cat) %>%
data.frame
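# For intuition, a tiny hypothetical example of what dummyVars produces:
#   toy <- data.frame(qual = factor(c("Ex", "Fa", "Ex")))
#   predict(dummyVars(~ ., toy), toy) # two 0/1 indicator columns: qual.Ex and qual.Fa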
ames_train_cat_imputed_prep <- preProcess(ames_train_cat_imputed,method = c("zv","nzv","medianImpute"),cutoff = 0.8 ,freqCut = 2, uniqueCut = 20)
# in the original Kaggle kernel this is: test_cat_imputed <- predict(train_cat_prep, test_dummy)
# note: the prep object is fit on train only; apply it to train to get train, and to test to get test
train_cat_pred <- predict(ames_train_cat_imputed_prep,ames_train_cat_imputed)
test_cat_pred <- predict(ames_train_cat_imputed_prep,ames_test_cat_imputed)
class(ames_train_cat_imputed_prep) # preProcess
class(ames_test_cat_imputed) # data.frame
# -------------- quants
# the quantitative columns don't need dummyVars
ames_train_quant_prep <- preProcess(ames_train_quant,method = c("zv","corr","knnImpute"),
cutoff = 0.8 ,freqCut = 2, uniqueCut = 20)
# the original kernel calls this num_imputed
train_num_pred <- predict(ames_train_quant_prep,ames_train_quant)
test_num_pred <- predict(ames_train_quant_prep, ames_test_quant)
# combine
train_imputed <- cbind(train_cat_pred, train_num_pred,SalePrice = SalePrice)
test_imputed <- cbind(test_cat_pred, test_num_pred)
cvFolds = createFolds(SalePrice, k = 5)
boot_trControl <- trainControl(predictionBounds = c(min(SalePrice),max(SalePrice)),
method ="boot",
number = 10,
selectionFunction ="best",
index = cvFolds,
search = "random",
verboseIter = FALSE,
savePredictions = TRUE)
gbm_model <- train(SalePrice ~. , train_imputed,
method = "gbm",
trControl = boot_trControl,
tuneLength = 10,
verbose = FALSE
) %>%
suppressWarnings
test_imputed<-cbind(ames_test_Id,test_imputed)
pred_take_two <- data.frame(Id = ames_test_Id, SalePrice = predict(gbm_model,test_imputed))
write.csv(pred_take_two,"C:\\Users\\arono\\source\\R\\Data605\\Final\\Problem3 Houses\\tmp\\predictions_take_two.csv", row.names = TRUE)
## [1] "preProcess"
## [1] "data.frame"