df <- read.csv("housing.csv")
df <- as.data.frame(df)
df2 <- df[,-c(6,7,73,74,75,10,40)]
full_df <- subset(df_22, select = -c(GrLivArea,TotalBsmtSF,Exterior1st))
kaggle_test <- as.data.frame(read.csv("test_housing.csv"))
Id_column <- kaggle_test$Id
## Get right skewed col
a <- colSums(df2 == 0)
right_skewed_cols <- names(a[a>200& is.na(a)==FALSE])
total_df_names <- names(a)
## get difference in columns
results1 = setdiff(total_df_names, right_skewed_cols)
## create new df
r_skew_df <- df2[,right_skewed_cols]
normal_df <- df2[,results1]
## Later analysis revealed several columns that should be categorized as factors
normal_df[,c("MSSubClass","OverallQual","OverallCond")] <- lapply(normal_df[,c("MSSubClass","OverallQual","OverallCond")], factor)
r_skew_df[,c("BsmtHalfBath","HalfBath",
"BsmtFullBath","Fireplaces")] <- lapply(r_skew_df[,c("BsmtHalfBath","HalfBath","BsmtFullBath","Fireplaces")], factor)
r_skew_df <- subset( r_skew_df, select = -c(BsmtHalfBath,HalfBath,
BsmtFullBath,Fireplaces) )
# get factor cols
is.fact <- sapply(normal_df, is.factor)
is.fact2 <- sapply(r_skew_df, is.factor)
factors_df <- normal_df[, is.fact]
factors_df2 <- r_skew_df[,is.fact2]
factors_df <- bind_cols(factors_df,factors_df2)
factors_df$SalePrice <- normal_df$SalePrice
##
factor_names <- names(factors_df)
normal_names <- names(normal_df)
results2 = setdiff(normal_names, factor_names)
numeric_predictors <- normal_df[,results2]
total_df <- bind_cols(numeric_predictors,factors_df,r_skew_df)
numeric_predictors$SalePrice <- normal_df$SalePrice
r_skew_df$SalePrice <- numeric_predictors$SalePrice
## Grab big df for later
#summary(total_df)
res <- cor(numeric_predictors,use= "complete.obs")
corrplot(res)
a <- 1:18
chunk <- function(x, n) split(x, sort(rank(x) %% n))
my_list <- chunk(a,4)
par(mfrow=c(3,3))
chart.Correlation(numeric_predictors[,c(my_list$'0',20)], histogram=TRUE, pch=19)
chart.Correlation(numeric_predictors[,c(my_list$'1',20)], histogram=TRUE, pch=19)
chart.Correlation(numeric_predictors[,c(my_list$'2',20)], histogram=TRUE, pch=19)
chart.Correlation(numeric_predictors[,c(my_list$'3',20)], histogram=TRUE, pch=19)
my_fit <- lm(SalePrice~.,data= numeric_predictors)
vif(my_fit)
## Id LotFrontage LotArea YearBuilt YearRemodAdd
## 1.015087 1.574285 1.314655 4.050863 2.059486
## MasVnrArea BsmtUnfSF TotalBsmtSF X1stFlrSF GrLivArea
## 1.405308 1.354039 4.424999 4.258494 5.015906
## FullBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd GarageYrBlt
## 2.470655 2.072899 1.275040 4.429492 4.327836
## GarageCars GarageArea MoSold YrSold
## 4.093638 4.292926 1.034259 1.035333
numeric_predictors_2 <- subset( numeric_predictors,
select = - c(BsmtUnfSF,X1stFlrSF,GarageCars,GarageYrBlt,LotFrontage,TotRmsAbvGrd,FullBath,BedroomAbvGr,KitchenAbvGr))
res <- cor(numeric_predictors_2[,2:11],use= "complete.obs")
corrplot(res)
chart.Correlation(numeric_predictors_2[,2:11], histogram=TRUE, pch=19)
my_fit <- lm(SalePrice~.,data= numeric_predictors_2)
vif(my_fit)
## Id LotArea YearBuilt YearRemodAdd MasVnrArea
## 1.007435 1.123161 1.923149 1.642460 1.331410
## TotalBsmtSF GrLivArea GarageArea MoSold YrSold
## 1.581703 1.598778 1.719565 1.025619 1.027127
df2 <- subset(df2, select = -c(Id,FireplaceQu))
corrplot(cor(r_skew_df,method = 'pearson', use = 'complete.obs'))
chart.Correlation(r_skew_df[,c(1,2,3,4,5,6,12)], histogram=TRUE, pch=19)
chart.Correlation(r_skew_df[,c(6,7,8,9,10,11,12)], histogram=TRUE, pch=19)
logr_skew_df <- log(r_skew_df+.0001)
corrplot(cor(logr_skew_df,method = 'pearson', use = 'complete.obs'))
chart.Correlation(logr_skew_df[,c(1,2,3,4,5,6,12)], histogram=TRUE, pch=19)
chart.Correlation(logr_skew_df[,c(6,7,8,9,10,11,12)], histogram=TRUE, pch=19)
temp_df <- r_skew_df
temp_df[temp_df == 0] <- NA
table(is.na(temp_df))
##
## FALSE TRUE
## 5187 12333
names_1 <- colnames(temp_df)
lapply(names_1,function(x){ggplot(temp_df,
aes_string(x = x, y = 'SalePrice')) +
geom_point() })
## [[1]]
## Warning: Removed 467 rows containing missing values (geom_point).
##
## [[2]]
## Warning: Removed 1293 rows containing missing values (geom_point).
##
## [[3]]
## Warning: Removed 829 rows containing missing values (geom_point).
##
## [[4]]
## Warning: Removed 1434 rows containing missing values (geom_point).
##
## [[5]]
## Warning: Removed 761 rows containing missing values (geom_point).
##
## [[6]]
## Warning: Removed 656 rows containing missing values (geom_point).
##
## [[7]]
## Warning: Removed 1252 rows containing missing values (geom_point).
##
## [[8]]
## Warning: Removed 1436 rows containing missing values (geom_point).
##
## [[9]]
## Warning: Removed 1344 rows containing missing values (geom_point).
##
## [[10]]
## Warning: Removed 1453 rows containing missing values (geom_point).
##
## [[11]]
## Warning: Removed 1408 rows containing missing values (geom_point).
##
## [[12]]
temp_df <- subset( temp_df, select = -c(BsmtFinSF2,LowQualFinSF,
X3SsnPorch,EnclosedPorch,PoolArea,MiscVal,ScreenPorch) )
chart.Correlation(temp_df, histogram=TRUE, pch=19)
temp_df <- temp_df %>%
filter(BsmtFinSF1<5600)
summary(temp_df$BsmtFinSF1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 370.8 604.0 647.2 866.2 2260.0
r_skew_df_2 <- subset( r_skew_df, select = -c(BsmtFinSF2,LowQualFinSF,
X3SsnPorch,EnclosedPorch,PoolArea,MiscVal,ScreenPorch) )
chart.Correlation(r_skew_df_2, histogram=TRUE, pch=19)
my_fit <- lm(SalePrice~.,data= numeric_predictors_2)
r_skew_df_2 <- r_skew_df_2[,1:2]
post_analysis_df <- bind_cols(r_skew_df_2,numeric_predictors_2)
corrplot(cor(logr_skew_df,method = 'pearson', use = 'complete.obs'))
chart.Correlation(post_analysis_df[,c(1,2,4,5,6,7,8,9,10,13)], histogram=TRUE, pch=19)
my_fit <- lm(SalePrice~.,data= post_analysis_df)
vif(my_fit)
## BsmtFinSF1 X2ndFlrSF Id LotArea YearBuilt
## 1.420428 4.553223 1.007970 1.141775 1.953314
## YearRemodAdd MasVnrArea TotalBsmtSF GrLivArea GarageArea
## 1.648109 1.344100 3.575395 6.003085 1.738801
## MoSold YrSold
## 1.026079 1.028034
my_correlation_mat <- cor(numeric_predictors[,c(2,17,20)],method = 'pearson', use = 'complete.obs')
kable(my_correlation_mat,caption=" Correlation matrix" )
LotFrontage | GarageArea | SalePrice | |
---|---|---|---|
LotFrontage | 1.0000000 | 0.3449967 | 0.3517991 |
GarageArea | 0.3449967 | 1.0000000 | 0.6317615 |
SalePrice | 0.3517991 | 0.6317615 | 1.0000000 |
cor.test(numeric_predictors$LotFrontage,
numeric_predictors$SalePrice,
method= "pearson",
conf.level=.80)
##
## Pearson's product-moment correlation
##
## data: numeric_predictors$LotFrontage and numeric_predictors$SalePrice
## t = 13.013, df = 1199, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.3189425 0.3838111
## sample estimates:
## cor
## 0.3517991
cor.test(numeric_predictors$GarageArea,
numeric_predictors$SalePrice,
method= "pearson",
conf.level=.80)
##
## Pearson's product-moment correlation
##
## data: numeric_predictors$GarageArea and numeric_predictors$SalePrice
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6024756 0.6435283
## sample estimates:
## cor
## 0.6234314
cor.test(numeric_predictors$GarageArea,
numeric_predictors$LotFrontage,
method= "pearson",
conf.level=.80)
##
## Pearson's product-moment correlation
##
## data: numeric_predictors$GarageArea and numeric_predictors$LotFrontage
## t = 12.727, df = 1199, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.3119708 0.3771899
## sample estimates:
## cor
## 0.3449967
cor.test(numeric_predictors$MoSold,
numeric_predictors$GarageArea,
method= "pearson",
conf.level=.80)
##
## Pearson's product-moment correlation
##
## data: numeric_predictors$MoSold and numeric_predictors$GarageArea
## t = 1.0686, df = 1458, p-value = 0.2854
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.005593091 0.061477721
## sample estimates:
## cor
## 0.0279738
my_correlation_mat
## LotFrontage GarageArea SalePrice
## LotFrontage 1.0000000 0.3449967 0.3517991
## GarageArea 0.3449967 1.0000000 0.6317615
## SalePrice 0.3517991 0.6317615 1.0000000
prec_matrix <- solve(my_correlation_mat)
prec_matrix
## LotFrontage GarageArea SalePrice
## LotFrontage 1.1748616 -0.2399936 -0.2616965
## GarageArea -0.2399936 1.7132574 -0.9979405
## SalePrice -0.2616965 -0.9979405 1.7225250
my_correlation_mat%*%prec_matrix
## LotFrontage GarageArea SalePrice
## LotFrontage 1.000000e+00 0 0.000000e+00
## GarageArea -2.775558e-17 1 -2.220446e-16
## SalePrice -5.551115e-17 0 1.000000e+00
prec_matrix%*%my_correlation_mat
## LotFrontage GarageArea SalePrice
## LotFrontage 1 -5.551115e-17 0.000000e+00
## GarageArea 0 1.000000e+00 -1.110223e-16
## SalePrice 0 0.000000e+00 1.000000e+00
lu.decomposition(my_correlation_mat)
## $L
## [,1] [,2] [,3]
## [1,] 1.0000000 0.0000000 0
## [2,] 0.3449967 1.0000000 0
## [3,] 0.3517991 0.5793475 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1 0.3449967 0.3517991
## [2,] 0 0.8809773 0.5103920
## [3,] 0 0.0000000 0.5805431
my_fit <- lm(SalePrice~.,data= numeric_predictors)
my_fit_2 <- update(my_fit,.~.-GrLivArea)
my_fit_2 <- update(my_fit_2,.~.-TotalBsmtSF)
vif(my_fit)
## Id LotFrontage LotArea YearBuilt YearRemodAdd
## 1.015087 1.574285 1.314655 4.050863 2.059486
## MasVnrArea BsmtUnfSF TotalBsmtSF X1stFlrSF GrLivArea
## 1.405308 1.354039 4.424999 4.258494 5.015906
## FullBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd GarageYrBlt
## 2.470655 2.072899 1.275040 4.429492 4.327836
## GarageCars GarageArea MoSold YrSold
## 4.093638 4.292926 1.034259 1.035333
vif(my_fit_2)
## Id LotFrontage LotArea YearBuilt YearRemodAdd
## 1.014369 1.573987 1.293743 3.783964 2.027873
## MasVnrArea BsmtUnfSF X1stFlrSF FullBath BedroomAbvGr
## 1.344090 1.204705 1.914545 2.193704 2.062311
## KitchenAbvGr TotRmsAbvGrd GarageYrBlt GarageCars GarageArea
## 1.193188 2.809577 4.314251 4.078585 4.205734
## MoSold YrSold
## 1.032221 1.035018
fitted_dist2 <- fitdistr(r_skew_df$OpenPorchSF,densfun ="negative binomial")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
fitted_dist <- fitdistr(r_skew_df$OpenPorchSF,densfun ="exponential")
lambda <- fitted_dist$estimate
fitted_dist$estimate
## rate
## 0.02143151
par(mfrow=c(2,2))
hist(rexp(1000,lambda),breaks=40)
hist(rnegbin(1000,theta=fitted_dist2$estimate[1]),breaks=40)
hist(r_skew_df$OpenPorchSF, breaks=40)
suppressWarnings(broom::glance(MASS::fitdistr(r_skew_df$OpenPorchSF,"normal")))
## n logLik AIC BIC
## 1 1460 -8193.699 16391.4 16401.97
suppressWarnings(broom::glance(MASS::fitdistr(r_skew_df$OpenPorchSF,"exponential")))
## n logLik AIC BIC
## 1 1460 -7070.624 14143.25 14148.53
suppressWarnings(broom::glance(MASS::fitdistr(r_skew_df$OpenPorchSF,"negative binomial")))
## n logLik AIC BIC
## 1 1460 -5790.731 11585.46 11596.03
\[CDF_{exp}(1 ??? e)^{\lambda x}\]
lower_sim <-log(1-0.05)/-lambda
upper_sim <-log(1-0.95)/-lambda
actual_lower <- quantile(r_skew_df$OpenPorchSF,.05)
actual_upper <- quantile(r_skew_df$OpenPorchSF,.95)
comparison <- as.data.frame(rbind(c(lower_sim,upper_sim),c(actual_lower,actual_upper) ) )
row.names(comparison) <- c("sim","actual")
colnames(comparison) <- c("lower","upper")
kable(comparison)
lower | upper | |
---|---|---|
sim | 2.393359 | 139.7817 |
actual | 0.000000 | 175.0500 |
a <- mean(r_skew_df$OpenPorchSF)
n <- as.numeric(length(r_skew_df$OpenPorchSF))
mysd <- sd(r_skew_df$OpenPorchSF, na.rm = FALSE)
error <- qnorm(0.975)*mysd/sqrt(n)
my_lower <- a-error
my_upper <- a+error
my_lower
## [1] 43.2617
my_upper
## [1] 50.05885
chooseOne = function(question){
factors_df %>%
filter(!UQ(sym(question)) == "") %>%
dplyr::group_by_(question) %>%
dplyr::summarise(count = n()) %>%
dplyr::mutate(percent = (count / sum(count)) * 100) %>%
dplyr::arrange(desc(count))
}
my_names <- colnames(factors_df)
the_names <- my_names[(c(2, 3, 10, 11, 12, 15, 19, 20, 25))]
lapply(my_names[1:5], function(x) chooseOne(x))
## [[1]]
## # A tibble: 15 x 3
## MSSubClass count percent
## <fct> <int> <dbl>
## 1 20 536 36.7
## 2 60 299 20.5
## 3 50 144 9.86
## 4 120 87 5.96
## 5 30 69 4.73
## 6 160 63 4.32
## 7 70 60 4.11
## 8 80 58 3.97
## 9 90 52 3.56
## 10 190 30 2.05
## 11 85 20 1.37
## 12 75 16 1.10
## 13 45 12 0.822
## 14 180 10 0.685
## 15 40 4 0.274
##
## [[2]]
## # A tibble: 5 x 3
## MSZoning count percent
## <fct> <int> <dbl>
## 1 RL 1151 78.8
## 2 RM 218 14.9
## 3 FV 65 4.45
## 4 RH 16 1.10
## 5 C (all) 10 0.685
##
## [[3]]
## # A tibble: 4 x 3
## LotShape count percent
## <fct> <int> <dbl>
## 1 Reg 925 63.4
## 2 IR1 484 33.2
## 3 IR2 41 2.81
## 4 IR3 10 0.685
##
## [[4]]
## # A tibble: 4 x 3
## LandContour count percent
## <fct> <int> <dbl>
## 1 Lvl 1311 89.8
## 2 Bnk 63 4.32
## 3 HLS 50 3.42
## 4 Low 36 2.47
##
## [[5]]
## # A tibble: 5 x 3
## LotConfig count percent
## <fct> <int> <dbl>
## 1 Inside 1052 72.1
## 2 Corner 263 18.0
## 3 CulDSac 94 6.44
## 4 FR2 47 3.22
## 5 FR3 4 0.274
par(mfrow=c(2,2))
for(x in 1:40){
plot(factors_df[,x],factors_df$SalePrice,main=my_names[x])
}
factors_df <- subset(factors_df, select = -c(LandSlope,Exterior2nd,BldgType,FireplaceQu,RoofStyle,RoofMatl,SaleType,OverallCond,
ExterQual,GarageQual,Electrical,BsmtCond,ExterCond, HeatingQC, BsmtExposure,BsmtFinType1,BsmtFinType2,Functional,GarageCond,Condition2,Exterior1st))
#impute missing values, hand picked
post_analysis_2 <- bind_cols(post_analysis_df,factors_df)
post_analysis_2 <- post_analysis_2[,-13]
mice_plot <- aggr(post_analysis_2, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(df2), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## BsmtQual 0.055479452
## BsmtCond 0.055479452
## ExterQual 0.025342466
## LotConfig 0.005479452
## MasVnrType 0.005479452
## MSSubClass 0.000000000
## MSZoning 0.000000000
## LotFrontage 0.000000000
## LotArea 0.000000000
## LotShape 0.000000000
## LandContour 0.000000000
## LandSlope 0.000000000
## Neighborhood 0.000000000
## Condition1 0.000000000
## Condition2 0.000000000
## BldgType 0.000000000
## HouseStyle 0.000000000
## OverallQual 0.000000000
## OverallCond 0.000000000
## YearBuilt 0.000000000
## YearRemodAdd 0.000000000
## RoofStyle 0.000000000
## RoofMatl 0.000000000
## Exterior1st 0.000000000
## Exterior2nd 0.000000000
## MasVnrArea 0.000000000
## ExterCond 0.000000000
## Foundation 0.000000000
## BsmtExposure 0.000000000
## BsmtFinType1 0.000000000
## BsmtFinSF1 0.000000000
#df4 <- missForest(post_analysis_2)
#hand_selected__df<- df4$ximp
#save(hand_selected__df,file="handselected.Rda")
## Impute full df
#df4 <- missForest(post_analysis_2)
#whole_df<- df4$ximp
#save(hand_selected__df,file="handselected.Rda")
#save(df_22,file="wholedf.Rda")
# df_model <- subset(df_44, select = -c(Id,YearRemodAdd))
# #df_model
library(MASS)
# model 1 on selected variables
set.seed(123)
smp_size <- floor(0.75 * nrow(hand_selected__df))
train_ind <- sample(seq_len(nrow(hand_selected__df)), size = smp_size)
train <- hand_selected__df[train_ind, ]
test <- hand_selected__df[-train_ind, ]
mod1 <- lm(data=hand_selected__df,SalePrice1~.)
predictions <- predict(mod1, test)
model_1_rmse <- rmse(log(test$SalePrice1), log(predictions)) ## .3429
vif(mod1)
## GVIF Df GVIF^(1/(2*Df))
## BsmtFinSF1 1.812840e+00 1 1.346417
## X2ndFlrSF 1.766844e+01 1 4.203384
## Id 1.092547e+00 1 1.045250
## LotArea 1.795199e+00 1 1.339851
## YearBuilt 1.304753e+01 1 3.612137
## YearRemodAdd 2.531173e+00 1 1.590966
## MasVnrArea 2.705630e+00 1 1.644880
## TotalBsmtSF 6.209359e+00 1 2.491858
## GrLivArea 1.067143e+01 1 3.266715
## GarageArea 2.332788e+00 1 1.527347
## MoSold 1.126890e+00 1 1.061551
## YrSold 1.146123e+00 1 1.070571
## MSSubClass 2.417886e+07 14 1.835245
## MSZoning 4.215254e+01 4 1.596258
## LotShape 2.131249e+00 3 1.134416
## LandContour 2.635871e+00 3 1.175314
## LotConfig 1.837355e+00 4 1.079007
## Neighborhood 3.860187e+04 24 1.246104
## Condition1 3.156546e+00 8 1.074486
## HouseStyle 7.766806e+05 7 2.634703
## OverallQual 3.235302e+01 9 1.213065
## MasVnrType 4.434212e+00 3 1.281748
## Foundation 1.406771e+01 5 1.302634
## BsmtQual 9.595879e+00 3 1.457742
## CentralAir 1.697293e+00 1 1.302802
## KitchenQual 6.369658e+00 3 1.361505
## GarageType 7.935278e+00 5 1.230145
## GarageFinish 3.314388e+00 2 1.349276
## PavedDrive 1.782207e+00 2 1.155419
## SaleCondition 2.914113e+00 5 1.112886
summary(mod1)
##
## Call:
## lm(formula = SalePrice1 ~ ., data = hand_selected__df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -400357 -12019 -302 11069 202275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.034e+05 1.254e+06 0.481 0.630478
## BsmtFinSF1 1.450e+01 2.266e+00 6.400 2.14e-10 ***
## X2ndFlrSF 2.683e+01 7.390e+00 3.630 0.000294 ***
## Id -8.680e-01 1.903e+00 -0.456 0.648326
## LotArea 4.951e-01 1.030e-01 4.806 1.71e-06 ***
## YearBuilt 1.048e+02 9.179e+01 1.142 0.253694
## YearRemodAdd 1.799e+02 5.914e+01 3.041 0.002403 **
## MasVnrArea 1.180e+01 6.976e+00 1.692 0.090904 .
## TotalBsmtSF 1.193e+00 4.359e+00 0.274 0.784310
## GrLivArea 4.365e+01 4.771e+00 9.149 < 2e-16 ***
## GarageArea 2.091e+01 5.483e+00 3.813 0.000143 ***
## MoSold -4.458e+02 3.014e+02 -1.479 0.139295
## YrSold -5.893e+02 6.187e+02 -0.953 0.341007
## MSSubClass30 -7.113e+03 5.525e+03 -1.287 0.198193
## MSSubClass40 -5.683e+03 1.529e+04 -0.372 0.710280
## MSSubClass45 -1.337e+04 2.397e+04 -0.558 0.577231
## MSSubClass50 4.236e+03 1.018e+04 0.416 0.677300
## MSSubClass60 3.047e+03 8.846e+03 0.344 0.730578
## MSSubClass70 4.956e+03 9.517e+03 0.521 0.602590
## MSSubClass75 4.672e+02 1.729e+04 0.027 0.978447
## MSSubClass80 -1.195e+02 1.393e+04 -0.009 0.993161
## MSSubClass85 -7.587e+03 1.248e+04 -0.608 0.543266
## MSSubClass90 -1.234e+04 6.036e+03 -2.044 0.041125 *
## MSSubClass120 -2.591e+04 4.641e+03 -5.583 2.85e-08 ***
## MSSubClass160 -2.373e+04 1.058e+04 -2.243 0.025090 *
## MSSubClass180 -1.757e+04 1.536e+04 -1.144 0.252762
## MSSubClass190 -5.386e+03 8.625e+03 -0.624 0.532418
## MSZoningFV 3.499e+04 1.471e+04 2.378 0.017545 *
## MSZoningRH 2.783e+04 1.470e+04 1.893 0.058560 .
## MSZoningRL 2.978e+04 1.232e+04 2.417 0.015762 *
## MSZoningRM 3.117e+04 1.152e+04 2.707 0.006885 **
## LotShapeIR2 1.251e+04 5.152e+03 2.429 0.015278 *
## LotShapeIR3 -3.518e+04 1.037e+04 -3.393 0.000712 ***
## LotShapeReg 3.956e+02 1.973e+03 0.201 0.841095
## LandContourHLS 2.469e+04 6.228e+03 3.965 7.73e-05 ***
## LandContourLow 2.147e+04 7.371e+03 2.913 0.003637 **
## LandContourLvl 1.694e+04 4.309e+03 3.931 8.90e-05 ***
## LotConfigCulDSac 1.062e+04 3.899e+03 2.725 0.006521 **
## LotConfigFR2 -9.687e+03 4.918e+03 -1.970 0.049082 *
## LotConfigFR3 -1.770e+04 1.565e+04 -1.131 0.258336
## LotConfigInside 6.962e+02 2.141e+03 0.325 0.745132
## NeighborhoodBlueste -9.775e+03 2.398e+04 -0.408 0.683646
## NeighborhoodBrDale -1.309e+04 1.373e+04 -0.954 0.340448
## NeighborhoodBrkSide -1.612e+04 1.121e+04 -1.437 0.150845
## NeighborhoodClearCr -1.023e+04 1.089e+04 -0.939 0.347643
## NeighborhoodCollgCr -1.649e+04 8.943e+03 -1.844 0.065341 .
## NeighborhoodCrawfor 9.182e+03 1.034e+04 0.888 0.374767
## NeighborhoodEdwards -3.739e+04 9.688e+03 -3.859 0.000119 ***
## NeighborhoodGilbert -1.668e+04 9.507e+03 -1.754 0.079653 .
## NeighborhoodIDOTRR -2.540e+04 1.276e+04 -1.991 0.046718 *
## NeighborhoodMeadowV -2.841e+04 1.396e+04 -2.035 0.042022 *
## NeighborhoodMitchel -2.661e+04 9.958e+03 -2.672 0.007637 **
## NeighborhoodNAmes -2.332e+04 9.456e+03 -2.466 0.013775 *
## NeighborhoodNoRidge 3.080e+04 1.031e+04 2.988 0.002857 **
## NeighborhoodNPkVill 2.387e+03 1.360e+04 0.176 0.860633
## NeighborhoodNridgHt 1.467e+04 9.276e+03 1.582 0.113901
## NeighborhoodNWAmes -1.983e+04 9.680e+03 -2.048 0.040711 *
## NeighborhoodOldTown -3.235e+04 1.145e+04 -2.826 0.004786 **
## NeighborhoodSawyer -2.330e+04 9.878e+03 -2.359 0.018458 *
## NeighborhoodSawyerW -1.257e+04 9.512e+03 -1.321 0.186646
## NeighborhoodSomerst -7.414e+03 1.123e+04 -0.660 0.509197
## NeighborhoodStoneBr 2.812e+04 1.026e+04 2.742 0.006196 **
## NeighborhoodSWISU -1.990e+04 1.177e+04 -1.691 0.091144 .
## NeighborhoodTimber -1.456e+04 1.010e+04 -1.442 0.149501
## NeighborhoodVeenker 1.617e+04 1.254e+04 1.290 0.197409
## Condition1Feedr -7.865e+03 5.856e+03 -1.343 0.179517
## Condition1Norm 5.910e+03 4.854e+03 1.218 0.223621
## Condition1PosA 1.109e+04 1.188e+04 0.933 0.351005
## Condition1PosN -8.980e+03 8.558e+03 -1.049 0.294220
## Condition1RRAe -1.897e+04 1.073e+04 -1.769 0.077190 .
## Condition1RRAn 5.772e+03 7.914e+03 0.729 0.465924
## Condition1RRNe -6.435e+03 2.192e+04 -0.294 0.769097
## Condition1RRNn 5.735e+03 1.515e+04 0.379 0.705115
## HouseStyle1.5Unf 3.097e+04 2.348e+04 1.319 0.187387
## HouseStyle1Story 2.378e+04 1.013e+04 2.346 0.019118 *
## HouseStyle2.5Fin -9.881e+03 1.822e+04 -0.542 0.587597
## HouseStyle2.5Unf -2.873e+03 1.783e+04 -0.161 0.871974
## HouseStyle2Story -9.497e+03 9.473e+03 -1.003 0.316269
## HouseStyleSFoyer 2.532e+04 1.342e+04 1.887 0.059377 .
## HouseStyleSLvl 1.875e+04 1.532e+04 1.224 0.221277
## OverallQual2 1.088e+04 2.864e+04 0.380 0.704099
## OverallQual3 1.747e+04 2.366e+04 0.739 0.460323
## OverallQual4 2.173e+04 2.307e+04 0.942 0.346348
## OverallQual5 2.396e+04 2.320e+04 1.033 0.302014
## OverallQual6 3.351e+04 2.326e+04 1.441 0.149910
## OverallQual7 4.296e+04 2.338e+04 1.837 0.066400 .
## OverallQual8 6.458e+04 2.366e+04 2.729 0.006427 **
## OverallQual9 1.086e+05 2.440e+04 4.451 9.24e-06 ***
## OverallQual10 1.250e+05 2.534e+04 4.932 9.16e-07 ***
## MasVnrTypeBrkFace 1.262e+04 8.198e+03 1.540 0.123809
## MasVnrTypeNone 1.506e+04 8.235e+03 1.829 0.067567 .
## MasVnrTypeStone 1.586e+04 8.713e+03 1.821 0.068857 .
## FoundationCBlock 4.715e+03 3.825e+03 1.233 0.217824
## FoundationPConc 5.202e+03 4.202e+03 1.238 0.215930
## FoundationSlab -9.702e+03 8.699e+03 -1.115 0.264934
## FoundationStone 8.753e+03 1.280e+04 0.684 0.494312
## FoundationWood -2.098e+04 1.827e+04 -1.148 0.251159
## BsmtQualFa -1.897e+04 7.537e+03 -2.517 0.011953 *
## BsmtQualGd -1.872e+04 4.169e+03 -4.491 7.71e-06 ***
## BsmtQualTA -2.012e+04 5.063e+03 -3.974 7.43e-05 ***
## CentralAirY 7.372e+03 4.053e+03 1.819 0.069130 .
## KitchenQualFa -2.739e+04 7.293e+03 -3.756 0.000180 ***
## KitchenQualGd -1.869e+04 4.304e+03 -4.343 1.51e-05 ***
## KitchenQualTA -2.505e+04 4.774e+03 -5.247 1.79e-07 ***
## GarageTypeAttchd 2.488e+04 1.306e+04 1.905 0.057040 .
## GarageTypeBasment 2.911e+04 1.517e+04 1.919 0.055181 .
## GarageTypeBuiltIn 2.200e+04 1.368e+04 1.608 0.108119
## GarageTypeCarPort 1.830e+04 1.657e+04 1.105 0.269430
## GarageTypeDetchd 2.452e+04 1.302e+04 1.883 0.059853 .
## GarageFinishRFn -1.808e+03 2.440e+03 -0.741 0.458841
## GarageFinishUnf -4.266e+03 2.902e+03 -1.470 0.141766
## PavedDriveP -3.748e+03 6.602e+03 -0.568 0.570291
## PavedDriveY -8.789e+02 3.967e+03 -0.222 0.824698
## SaleConditionAdjLand 2.033e+04 1.605e+04 1.266 0.205639
## SaleConditionAlloca 1.731e+04 1.019e+04 1.699 0.089607 .
## SaleConditionFamily 6.068e+02 7.434e+03 0.082 0.934958
## SaleConditionNormal 9.610e+03 3.218e+03 2.986 0.002877 **
## SaleConditionPartial 1.833e+04 4.646e+03 3.944 8.43e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29320 on 1342 degrees of freedom
## Multiple R-squared: 0.8747, Adjusted R-squared: 0.8638
## F-statistic: 80.1 on 117 and 1342 DF, p-value: < 2.2e-16
# Model 2 backwards select on selected variables
fit2 <- lm(data=hand_selected__df,SalePrice1~.)
step.model <- stepAIC(fit2, direction = "both",
trace = FALSE)
summary(step.model)
##
## Call:
## lm(formula = SalePrice1 ~ BsmtFinSF1 + X2ndFlrSF + LotArea +
## YearBuilt + YearRemodAdd + MasVnrArea + GrLivArea + GarageArea +
## MoSold + MSSubClass + LotShape + LandContour + LotConfig +
## Neighborhood + Condition1 + OverallQual + BsmtQual + CentralAir +
## KitchenQual + SaleCondition, data = hand_selected__df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -406923 -11950 -196 11290 210168
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.700e+05 1.917e+05 -3.494 0.000490 ***
## BsmtFinSF1 1.605e+01 2.114e+00 7.595 5.68e-14 ***
## X2ndFlrSF 1.425e+01 5.424e+00 2.627 0.008717 **
## LotArea 5.047e-01 1.016e-01 4.968 7.63e-07 ***
## YearBuilt 1.960e+02 8.149e+01 2.405 0.016292 *
## YearRemodAdd 1.761e+02 5.764e+01 3.055 0.002290 **
## MasVnrArea 8.251e+00 5.520e+00 1.495 0.135213
## GrLivArea 4.563e+01 3.398e+00 13.429 < 2e-16 ***
## GarageArea 1.788e+01 5.196e+00 3.440 0.000598 ***
## MoSold -4.747e+02 2.964e+02 -1.602 0.109484
## MSSubClass30 -4.173e+03 5.267e+03 -0.792 0.428301
## MSSubClass40 -4.799e+03 1.516e+04 -0.317 0.751659
## MSSubClass45 -3.022e+03 9.367e+03 -0.323 0.747018
## MSSubClass50 -1.306e+04 4.611e+03 -2.833 0.004684 **
## MSSubClass60 -2.016e+04 4.856e+03 -4.152 3.50e-05 ***
## MSSubClass70 -1.693e+04 6.702e+03 -2.526 0.011632 *
## MSSubClass75 -1.504e+04 9.693e+03 -1.552 0.120874
## MSSubClass80 -2.873e+03 4.304e+03 -0.667 0.504657
## MSSubClass85 -5.017e+03 6.972e+03 -0.720 0.471903
## MSSubClass90 -2.056e+04 4.953e+03 -4.151 3.52e-05 ***
## MSSubClass120 -2.512e+04 4.336e+03 -5.793 8.58e-09 ***
## MSSubClass160 -4.924e+04 6.772e+03 -7.271 5.97e-13 ***
## MSSubClass180 -1.649e+04 1.184e+04 -1.393 0.163895
## MSSubClass190 -1.931e+04 6.948e+03 -2.779 0.005526 **
## LotShapeIR2 1.175e+04 5.086e+03 2.310 0.021028 *
## LotShapeIR3 -3.525e+04 1.021e+04 -3.451 0.000575 ***
## LotShapeReg 3.809e+01 1.951e+03 0.020 0.984429
## LandContourHLS 2.371e+04 6.100e+03 3.887 0.000106 ***
## LandContourLow 1.937e+04 7.250e+03 2.672 0.007623 **
## LandContourLvl 1.726e+04 4.208e+03 4.101 4.35e-05 ***
## LotConfigCulDSac 9.348e+03 3.860e+03 2.422 0.015571 *
## LotConfigFR2 -1.115e+04 4.886e+03 -2.282 0.022626 *
## LotConfigFR3 -1.862e+04 1.554e+04 -1.198 0.230970
## LotConfigInside -7.418e+00 2.120e+03 -0.003 0.997208
## NeighborhoodBlueste -1.600e+03 2.313e+04 -0.069 0.944871
## NeighborhoodBrDale -1.211e+04 1.263e+04 -0.959 0.337796
## NeighborhoodBrkSide -1.339e+04 1.018e+04 -1.315 0.188719
## NeighborhoodClearCr -5.837e+03 1.058e+04 -0.552 0.581375
## NeighborhoodCollgCr -1.551e+04 8.546e+03 -1.815 0.069694 .
## NeighborhoodCrawfor 1.147e+04 1.003e+04 1.144 0.253026
## NeighborhoodEdwards -3.479e+04 9.271e+03 -3.752 0.000183 ***
## NeighborhoodGilbert -1.492e+04 9.212e+03 -1.620 0.105558
## NeighborhoodIDOTRR -2.794e+04 1.063e+04 -2.629 0.008667 **
## NeighborhoodMeadowV -2.293e+04 1.255e+04 -1.827 0.067842 .
## NeighborhoodMitchel -2.555e+04 9.563e+03 -2.672 0.007638 **
## NeighborhoodNAmes -2.068e+04 9.091e+03 -2.275 0.023079 *
## NeighborhoodNoRidge 3.382e+04 9.940e+03 3.403 0.000686 ***
## NeighborhoodNPkVill 4.068e+03 1.309e+04 0.311 0.755999
## NeighborhoodNridgHt 1.546e+04 8.815e+03 1.754 0.079607 .
## NeighborhoodNWAmes -1.690e+04 9.331e+03 -1.811 0.070356 .
## NeighborhoodOldTown -2.909e+04 9.769e+03 -2.978 0.002951 **
## NeighborhoodSawyer -2.112e+04 9.525e+03 -2.217 0.026796 *
## NeighborhoodSawyerW -1.233e+04 9.158e+03 -1.346 0.178468
## NeighborhoodSomerst -2.091e+03 8.771e+03 -0.238 0.811595
## NeighborhoodStoneBr 3.026e+04 9.969e+03 3.036 0.002444 **
## NeighborhoodSWISU -1.930e+04 1.126e+04 -1.714 0.086671 .
## NeighborhoodTimber -1.457e+04 9.866e+03 -1.477 0.139932
## NeighborhoodVeenker 2.001e+04 1.222e+04 1.638 0.101572
## Condition1Feedr -9.173e+03 5.768e+03 -1.590 0.111953
## Condition1Norm 5.633e+03 4.770e+03 1.181 0.237795
## Condition1PosA 1.085e+04 1.184e+04 0.917 0.359563
## Condition1PosN -9.299e+03 8.505e+03 -1.093 0.274418
## Condition1RRAe -1.935e+04 1.054e+04 -1.835 0.066741 .
## Condition1RRAn 5.164e+03 7.789e+03 0.663 0.507489
## Condition1RRNe -7.446e+03 2.189e+04 -0.340 0.733781
## Condition1RRNn 5.585e+03 1.477e+04 0.378 0.705426
## OverallQual2 1.052e+04 2.807e+04 0.375 0.707785
## OverallQual3 1.897e+04 2.342e+04 0.810 0.418079
## OverallQual4 2.572e+04 2.280e+04 1.128 0.259390
## OverallQual5 2.999e+04 2.288e+04 1.311 0.190050
## OverallQual6 3.969e+04 2.293e+04 1.731 0.083721 .
## OverallQual7 4.998e+04 2.307e+04 2.166 0.030454 *
## OverallQual8 7.223e+04 2.335e+04 3.093 0.002018 **
## OverallQual9 1.167e+05 2.408e+04 4.848 1.39e-06 ***
## OverallQual10 1.346e+05 2.505e+04 5.372 9.12e-08 ***
## BsmtQualFa -1.940e+04 7.416e+03 -2.617 0.008974 **
## BsmtQualGd -1.976e+04 4.125e+03 -4.790 1.85e-06 ***
## BsmtQualTA -2.087e+04 4.924e+03 -4.238 2.41e-05 ***
## CentralAirY 8.858e+03 3.896e+03 2.274 0.023143 *
## KitchenQualFa -2.571e+04 7.213e+03 -3.565 0.000377 ***
## KitchenQualGd -1.730e+04 4.256e+03 -4.065 5.08e-05 ***
## KitchenQualTA -2.429e+04 4.709e+03 -5.159 2.85e-07 ***
## SaleConditionAdjLand 2.673e+04 1.576e+04 1.696 0.090084 .
## SaleConditionAlloca 1.674e+04 9.755e+03 1.716 0.086338 .
## SaleConditionFamily 2.329e+03 7.386e+03 0.315 0.752602
## SaleConditionNormal 1.076e+04 3.149e+03 3.416 0.000653 ***
## SaleConditionPartial 2.035e+04 4.553e+03 4.470 8.47e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29410 on 1373 degrees of freedom
## Multiple R-squared: 0.871, Adjusted R-squared: 0.8629
## F-statistic: 107.8 on 86 and 1373 DF, p-value: < 2.2e-16
predictions <- predict(step.model, test)
model_2_rmse <- rmse(log(test$SalePrice), log(predictions))
# Model 3
smp_size <- floor(0.75 * nrow(full_df))
set.seed(123)
train_ind <- sample(seq_len(nrow(full_df)), size = smp_size)
train <- full_df[train_ind, ]
test <- full_df[-train_ind, ]
mod2 <- lm(data=full_df,SalePrice~.)
mod2_sum <- summary(mod2)
alias(mod2)
## Model :
## SalePrice ~ MSSubClass + MSZoning + LotFrontage + LotArea + LotShape +
## LandContour + LotConfig + LandSlope + Neighborhood + Condition1 +
## Condition2 + BldgType + HouseStyle + OverallQual + OverallCond +
## YearBuilt + YearRemodAdd + RoofStyle + RoofMatl + Exterior2nd +
## MasVnrType + MasVnrArea + ExterQual + ExterCond + Foundation +
## BsmtQual + BsmtCond + BsmtExposure + BsmtFinType1 + BsmtFinSF1 +
## BsmtFinType2 + BsmtFinSF2 + BsmtUnfSF + HeatingQC + CentralAir +
## Electrical + X1stFlrSF + X2ndFlrSF + LowQualFinSF + BsmtFullBath +
## BsmtHalfBath + FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr +
## KitchenQual + TotRmsAbvGrd + Functional + Fireplaces + GarageType +
## GarageYrBlt + GarageFinish + GarageCars + GarageArea + GarageQual +
## GarageCond + PavedDrive + WoodDeckSF + OpenPorchSF + EnclosedPorch +
## X3SsnPorch + ScreenPorch + PoolArea + MiscVal + MoSold +
## YrSold + SaleType + SaleCondition
predictions <- predict(mod2, test)
model_3_rmse <- rmse(log(test$SalePrice), log(predictions))## .473676544
vif(mod2)
## GVIF Df GVIF^(1/(2*Df))
## MSSubClass 3.243034e+01 1 5.694765
## MSZoning 5.565414e+01 4 1.652674
## LotFrontage 3.363250e+00 1 1.833917
## LotArea 3.097547e+00 1 1.759985
## LotShape 3.042801e+00 3 1.203776
## LandContour 4.546508e+00 3 1.287102
## LotConfig 2.890435e+00 4 1.141880
## LandSlope 5.583604e+00 2 1.537194
## Neighborhood 8.476487e+05 24 1.328937
## Condition1 1.006241e+01 8 1.155231
## Condition2 2.787284e+01 7 1.268315
## BldgType 3.170863e+02 4 2.054221
## HouseStyle 5.846074e+02 7 1.576279
## OverallQual 5.313678e+00 1 2.305142
## OverallCond 2.600351e+00 1 1.612560
## YearBuilt 1.485690e+01 1 3.854466
## YearRemodAdd 3.571418e+00 1 1.889820
## RoofStyle 8.084095e+01 5 1.551541
## RoofMatl 9.484462e+01 7 1.384252
## Exterior2nd 2.391576e+02 15 1.200299
## MasVnrType 5.965833e+00 3 1.346724
## MasVnrArea 3.016140e+00 1 1.736704
## ExterQual 1.713939e+01 3 1.605705
## ExterCond 4.783348e+00 4 1.216092
## Foundation 2.792192e+01 5 1.395065
## BsmtQual 1.341061e+01 3 1.541374
## BsmtCond 6.163974e+00 3 1.354077
## BsmtExposure 3.554808e+00 3 1.235386
## BsmtFinType1 1.702031e+01 5 1.327690
## BsmtFinSF1 1.203975e+01 1 3.469835
## BsmtFinType2 1.099871e+01 5 1.270967
## BsmtFinSF2 5.211180e+00 1 2.282801
## BsmtUnfSF 8.062634e+00 1 2.839478
## HeatingQC 5.614177e+00 4 1.240683
## CentralAir 2.274110e+00 1 1.508015
## Electrical 1.298990e+01 4 1.377846
## X1stFlrSF 1.075863e+01 1 3.280035
## X2ndFlrSF 1.557348e+01 1 3.946325
## LowQualFinSF 2.160945e+00 1 1.470015
## BsmtFullBath 2.895168e+00 1 1.701519
## BsmtHalfBath 1.425557e+00 1 1.193967
## FullBath 4.016002e+00 1 2.003997
## HalfBath 3.034488e+00 1 1.741978
## BedroomAbvGr 3.385744e+00 1 1.840039
## KitchenAbvGr 4.016196e+00 1 2.004045
## KitchenQual 9.572783e+00 3 1.457157
## TotRmsAbvGrd 6.476568e+00 1 2.544910
## Functional 7.734544e+00 6 1.185868
## Fireplaces 2.043756e+00 1 1.429600
## GarageType 1.408216e+01 5 1.302768
## GarageYrBlt 6.605429e+00 1 2.570103
## GarageFinish 3.945972e+00 2 1.409414
## GarageCars 7.282711e+00 1 2.698650
## GarageArea 7.713437e+00 1 2.777308
## GarageQual 4.872002e+01 4 1.625412
## GarageCond 2.526991e+01 4 1.497357
## PavedDrive 2.336810e+00 2 1.236391
## WoodDeckSF 1.473304e+00 1 1.213797
## OpenPorchSF 1.580156e+00 1 1.257043
## EnclosedPorch 1.565488e+00 1 1.251194
## X3SsnPorch 1.204158e+00 1 1.097341
## ScreenPorch 1.288485e+00 1 1.135115
## PoolArea 1.498178e+00 1 1.224001
## MiscVal 1.383216e+00 1 1.176102
## MoSold 1.205865e+00 1 1.098119
## YrSold 1.289333e+00 1 1.135488
## SaleType 1.586747e+02 8 1.372561
## SaleCondition 1.702461e+02 5 1.671505
plot(mod2)
## Warning: not plotting observations with leverage one:
## 121, 186, 251, 272, 326, 376, 399, 584, 596, 667, 1004, 1231, 1271, 1276, 1299, 1371
## Warning: not plotting observations with leverage one:
## 121, 186, 251, 272, 326, 376, 399, 584, 596, 667, 1004, 1231, 1271, 1276, 1299, 1371
# Model 4 backwards select full
# seems slightly better and reduces inputs from 117-83
fit2 <- lm(data=full_df,SalePrice~.)
step.model1 <- stepAIC(fit2, direction = "both",
trace = FALSE)
predictions1 <- predict(step.model1, test)
model_4_rmse <- rmse(log(test$SalePrice), log(predictions1))#502544959
scores_df <- as.data.frame(rbind(model_1_rmse,model_2_rmse,model_3_rmse,model_4_rmse))
colnames(scores_df) <- c("log_rmse")
kable(scores_df)
log_rmse | |
---|---|
model_1_rmse | 0.1414235 |
model_2_rmse | 0.1428636 |
model_3_rmse | 0.1227174 |
model_4_rmse | 0.1304504 |
# Model 5 try elastic net with lasso on entire df
## taken from https://www4.stat.ncsu.edu/~post/josh/LASSO_Ridge_Elastic_Net_-_Examples.html
pkgs <- list("glmnet", "doParallel", "foreach", "pROC")
lapply(pkgs, require, character.only = T)
## Loading required package: doParallel
## Warning: package 'doParallel' was built under R version 3.4.4
## Loading required package: parallel
## Loading required package: pROC
## Warning: package 'pROC' was built under R version 3.4.4
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:glmnet':
##
## auc
## The following object is masked from 'package:Metrics':
##
## auc
## The following object is masked from 'package:colorspace':
##
## coords
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
## [[1]]
## [1] TRUE
##
## [[2]]
## [1] TRUE
##
## [[3]]
## [1] TRUE
##
## [[4]]
## [1] TRUE
registerDoParallel(cores = 4)
n <- nrow(full_df)
sample <- sample(seq(n), size = n * 0.7, replace = FALSE)
train <- full_df[sample,]
test <- full_df[-sample,]
mdlY <- as.matrix(train["SalePrice"])
#train <- subset(train,select = -c(SalePrice1))
mdlX <- model.matrix(~., data=train[,1:68])[,-1]
colnames(full_df)
## [1] "MSSubClass" "MSZoning" "LotFrontage" "LotArea"
## [5] "LotShape" "LandContour" "LotConfig" "LandSlope"
## [9] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [13] "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
## [17] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior2nd"
## [21] "MasVnrType" "MasVnrArea" "ExterQual" "ExterCond"
## [25] "Foundation" "BsmtQual" "BsmtCond" "BsmtExposure"
## [29] "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2" "BsmtFinSF2"
## [33] "BsmtUnfSF" "HeatingQC" "CentralAir" "Electrical"
## [37] "X1stFlrSF" "X2ndFlrSF" "LowQualFinSF" "BsmtFullBath"
## [41] "BsmtHalfBath" "FullBath" "HalfBath" "BedroomAbvGr"
## [45] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Functional"
## [49] "Fireplaces" "GarageType" "GarageYrBlt" "GarageFinish"
## [53] "GarageCars" "GarageArea" "GarageQual" "GarageCond"
## [57] "PavedDrive" "WoodDeckSF" "OpenPorchSF" "EnclosedPorch"
## [61] "X3SsnPorch" "ScreenPorch" "PoolArea" "MiscVal"
## [65] "MoSold" "YrSold" "SaleType" "SaleCondition"
## [69] "SalePrice"
y.test <- as.matrix(test["SalePrice"])
x.test <- model.matrix(~., data=test[,1:68])[,-1]
fit.lasso <- glmnet(mdlX, mdlY, family="gaussian", alpha=1)
fit.ridge <- glmnet(mdlX, mdlY, family="gaussian", alpha=0)
fit.elnet <- glmnet(mdlX, mdlY, family="gaussian", alpha=.5)
for (i in 0:10) {
assign(paste("fit", i, sep=""), cv.glmnet(mdlX, mdlY, type.measure="mse",
alpha=i/10,family="gaussian"))
}
par(mfrow=c(3,2))
# For plotting options, type '?plot.glmnet' in R console
plot(fit.lasso, xvar="lambda")
plot(fit10, main="LASSO")
plot(fit.ridge, xvar="lambda")
plot(fit0, main="Ridge")
plot(fit.elnet, xvar="lambda")
plot(fit5, main="Elastic Net")
yhat0 <- predict(fit0, s=fit0$lambda.1se, newx=x.test)
yhat1 <- predict(fit1, s=fit1$lambda.1se, newx=x.test)
yhat2 <- predict(fit2, s=fit2$lambda.1se, newx=x.test)
yhat3 <- predict(fit3, s=fit3$lambda.1se, newx=x.test)
yhat4 <- predict(fit4, s=fit4$lambda.1se, newx=x.test)
yhat5 <- predict(fit5, s=fit5$lambda.1se, newx=x.test)
yhat6 <- predict(fit6, s=fit6$lambda.1se, newx=x.test)
yhat7 <- predict(fit7, s=fit7$lambda.1se, newx=x.test)
yhat8 <- predict(fit8, s=fit8$lambda.1se, newx=x.test)
yhat9 <- predict(fit9, s=fit9$lambda.1se, newx=x.test)
yhat10 <- predict(fit10, s=fit10$lambda.1se, newx=x.test)
mse0 <- sqrt(mean((log(y.test) - log(yhat0))^2))
mse1 <- sqrt(mean((log(y.test) - log(yhat1))^2))
mse2 <- sqrt(mean((log(y.test) - log(yhat2))^2))
mse3 <- sqrt(mean((log(y.test) - log(yhat3))^2))
mse4 <- sqrt(mean((log(y.test) - log(yhat4))^2))
mse5 <- sqrt(mean((log(y.test) - log(yhat5))^2))
mse6 <- sqrt(mean((log(y.test) - log(yhat6))^2))
mse7 <- sqrt(mean((log(y.test) - log(yhat7))^2))
mse8 <- sqrt(mean((log(y.test) - log(yhat8))^2))
mse9 <- sqrt(mean((log(y.test) - log(yhat9))^2))
mse10 <-sqrt( mean((log(y.test) - log(yhat10))^2))
as.data.frame(cbind(alpha= c(0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1), MSE=c(mse0,mse1,mse2,mse3,mse4,mse5,mse6,mse7,mse8,mse9,mse10)))
## alpha MSE
## 1 0.0 0.1557263
## 2 0.1 0.1558410
## 3 0.2 0.1563490
## 4 0.3 0.1563748
## 5 0.4 0.1620576
## 6 0.5 0.1586945
## 7 0.6 0.1609745
## 8 0.7 0.1642626
## 9 0.8 0.1644280
## 10 0.9 0.1613305
## 11 1.0 0.1631247
#### Model 6 elastic net hand selected
pkgs <- list("glmnet", "doParallel", "foreach", "pROC")
lapply(pkgs, require, character.only = T)
## [[1]]
## [1] TRUE
##
## [[2]]
## [1] TRUE
##
## [[3]]
## [1] TRUE
##
## [[4]]
## [1] TRUE
registerDoParallel(cores = 4)
n <- nrow(hand_selected__df)
sample <- sample(seq(n), size = n * 0.7, replace = FALSE)
train <- hand_selected__df[sample,]
test <- hand_selected__df[-sample,]
mdlY <- as.matrix(train["SalePrice1"])
#train <- subset(train,select = -c(SalePrice1))
mdlX <- model.matrix(~., data=train[,1:30])[,-1]
y.test <- as.matrix(test["SalePrice1"])
x.test <- model.matrix(~., data=test[,1:30])[,-1]
fit.lasso <- glmnet(mdlX, mdlY, family="gaussian", alpha=1)
fit.ridge <- glmnet(mdlX, mdlY, family="gaussian", alpha=0)
fit.elnet <- glmnet(mdlX, mdlY, family="gaussian", alpha=.5)
for (i in 0:10) {
assign(paste("fit", i, sep=""), cv.glmnet(mdlX, mdlY, type.measure="mse",
alpha=i/10,family="gaussian"))
}
par(mfrow=c(3,2))
plot(fit.lasso, xvar="lambda")
plot(fit10, main="LASSO")
plot(fit.ridge, xvar="lambda")
plot(fit0, main="Ridge")
plot(fit.elnet, xvar="lambda")
plot(fit5, main="Elastic Net")
yhat0 <- predict(fit0, s=fit0$lambda.1se, newx=x.test)
yhat1 <- predict(fit1, s=fit1$lambda.1se, newx=x.test)
yhat2 <- predict(fit2, s=fit2$lambda.1se, newx=x.test)
yhat3 <- predict(fit3, s=fit3$lambda.1se, newx=x.test)
yhat4 <- predict(fit4, s=fit4$lambda.1se, newx=x.test)
yhat5 <- predict(fit5, s=fit5$lambda.1se, newx=x.test)
yhat6 <- predict(fit6, s=fit6$lambda.1se, newx=x.test)
yhat7 <- predict(fit7, s=fit7$lambda.1se, newx=x.test)
yhat8 <- predict(fit8, s=fit8$lambda.1se, newx=x.test)
yhat9 <- predict(fit9, s=fit9$lambda.1se, newx=x.test)
yhat10 <- predict(fit10, s=fit10$lambda.1se, newx=x.test)
mymse0 <- sqrt(mean((log(y.test) - log(yhat0))^2))
mymse1 <- sqrt(mean((log(y.test) - log(yhat1))^2))
mymse2 <- sqrt(mean((log(y.test) - log(yhat2))^2))
mymse3 <- sqrt(mean((log(y.test) - log(yhat3))^2))
mymse4 <- sqrt(mean((log(y.test) - log(yhat4))^2))
mymse5 <- sqrt(mean((log(y.test) - log(yhat5))^2))
mymse6 <- sqrt(mean((log(y.test) - log(yhat6))^2))
mymse7 <- sqrt(mean((log(y.test) - log(yhat7))^2))
mymse8 <- sqrt(mean((log(y.test) - log(yhat8))^2))
mymse9 <- sqrt(mean((log(y.test) - log(yhat9))^2))
mymse10 <-sqrt( mean((log(y.test) - log(yhat10))^2))
as.data.frame(cbind(alpha= c(0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1), myMSE=c(mymse0,mymse1,mymse2,mymse3,mymse4,mymse5,mymse6,mymse7,mymse8,mymse9,mymse10)))
## alpha myMSE
## 1 0.0 0.1818170
## 2 0.1 0.1991496
## 3 0.2 0.1972896
## 4 0.3 0.1918283
## 5 0.4 0.1926680
## 6 0.5 0.1948259
## 7 0.6 0.1934737
## 8 0.7 0.1926466
## 9 0.8 0.1958409
## 10 0.9 0.1858260
## 11 1.0 0.1914759
#as.data.frame(cbind(alpha= c(0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1), #MSE_whole_df=c(mse0,mse1,mse2,mse3,mse4,mse5,mse6,mse7,mse8,mse9,mse10)))
kaggle_test$SalePrice <- NA
filter_names <- colnames(full_df)
kaggle_test <- kaggle_test[,filter_names]
mice_plot <- aggr(kaggle_test, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(kaggle_test), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies
##
## Variables sorted by number of missings:
## Variable Count
## SalePrice 1.000000000
## LotFrontage 0.155586018
## GarageYrBlt 0.053461275
## GarageFinish 0.053461275
## GarageQual 0.053461275
## GarageCond 0.053461275
## GarageType 0.052090473
## BsmtCond 0.030843043
## BsmtQual 0.030157642
## BsmtExposure 0.030157642
## BsmtFinType1 0.028786840
## BsmtFinType2 0.028786840
## MasVnrType 0.010966415
## MasVnrArea 0.010281014
## MSZoning 0.002741604
## BsmtFullBath 0.001370802
## BsmtHalfBath 0.001370802
## Functional 0.001370802
## Exterior2nd 0.000685401
## BsmtFinSF1 0.000685401
## BsmtFinSF2 0.000685401
## BsmtUnfSF 0.000685401
## KitchenQual 0.000685401
## GarageCars 0.000685401
## GarageArea 0.000685401
## SaleType 0.000685401
## MSSubClass 0.000000000
## LotArea 0.000000000
## LotShape 0.000000000
## LandContour 0.000000000
## LotConfig 0.000000000
## LandSlope 0.000000000
## Neighborhood 0.000000000
## Condition1 0.000000000
## Condition2 0.000000000
## BldgType 0.000000000
## HouseStyle 0.000000000
## OverallQual 0.000000000
## OverallCond 0.000000000
## YearBuilt 0.000000000
## YearRemodAdd 0.000000000
## RoofStyle 0.000000000
## RoofMatl 0.000000000
## ExterQual 0.000000000
## ExterCond 0.000000000
## Foundation 0.000000000
## HeatingQC 0.000000000
## CentralAir 0.000000000
## Electrical 0.000000000
## X1stFlrSF 0.000000000
## X2ndFlrSF 0.000000000
## LowQualFinSF 0.000000000
## FullBath 0.000000000
## HalfBath 0.000000000
## BedroomAbvGr 0.000000000
## KitchenAbvGr 0.000000000
## TotRmsAbvGrd 0.000000000
## Fireplaces 0.000000000
## PavedDrive 0.000000000
## WoodDeckSF 0.000000000
## OpenPorchSF 0.000000000
## EnclosedPorch 0.000000000
## X3SsnPorch 0.000000000
## ScreenPorch 0.000000000
## PoolArea 0.000000000
## MiscVal 0.000000000
## MoSold 0.000000000
## YrSold 0.000000000
## SaleCondition 0.000000000
my_names <- names(Filter(is.factor, full_df))
my_names2 <- names(Filter(is.numeric, full_df))
## get same types
kaggle_test[,my_names] <- lapply( kaggle_test[,my_names],factor)
#kaggle_test[,my_names2] <- lapply( kaggle_test[,my_names2],numeric)
# #impute missing values, using all parameters as default values
# kaggle_test<- missForest(kaggle_test)
# kaggle_test<- kaggle_test$ximp
# save(kaggle_test,file="Kaggle_ready_predict.Rda")
#loads as kaggle_test
load("Kaggle_ready_predict.Rda")
predictions <- predict(mod2, kaggle_test)
predictions_df <- as.data.frame(predictions)
predictions_df$Id_column <- Id_column
colnames(predictions_df) <- c("SalePrice","Id")
predictions_df2 <- predictions_df[,c("Id","SalePrice")]
write.csv(predictions_df2,'Jherman_submission.csv')
# mod2 <- lm(data=full_df,SalePrice~.)
# mod2_sum <- summary(mod2)
# alias(mod2)
# predictions <- predict(mod2, test)
# model_3_mse <- rmse(log(test$SalePrice), log(predictions))## .473676544
# vif(mod2)
# plot(mod2)
#df_model <- subset(df_44, select = -c(Id,YearRemodAdd))
#fit2 <- lm(data=df_model,SalePrice1~.)
#summary(fit2)
#
# #step.model <- stepAIC(fit2, direction = "both",
# # trace = FALSE)
# #summary(step.model)
# #crPlots(step.model)
#
#
#
#
#
# ## POssible changes
# #yes
# summary(lm(data=hand_selected__df,SalePrice1~LotArea))
# summary(lm(data=hand_selected__df,SalePrice1~log(LotArea)))
#
# #no
# summary(lm(data=hand_selected__df,SalePrice1~TotalBsmtSF))
# summary(lm(data=hand_selected__df,SalePrice1~log(TotalBsmtSF+.0001)))
# plot(data=hand_selected__df,SalePrice1~GrLivArea)
# plot(data=hand_selected__df,SalePrice1~I(GrLivArea^2))
# plot(data=hand_selected__df,SalePrice1~log(GrLivArea))
# plot(data=hand_selected__df,SalePrice1~BsmtFinSF1)
# plot(data=hand_selected__df,SalePrice1~I(BsmtFinSF1^2))
# plot(data=hand_selected__df,SalePrice1~log(BsmtFinSF1+.001))
# summary(lm(data=hand_selected__df,SalePrice1~GrLivArea+I(GrLivArea^2)))
# summary(lm(data=hand_selected__df,SalePrice1~log(GrLivArea+.001)))
#
# ## TotalBsmtSF GrlivingArea Lot area
# #avPlots(step.model)
#
#
my_names2 <- names(Filter(is.numeric, full_df))
get_numeric <- sapply(full_df, is.numeric)
numeric_df <- full_df[,my_names2]
numeric_2 <- subset(numeric_df,select = -c(X1stFlrSF,GarageCars))
## Nan was produced so lets run simple alias check and vif check
numeric_fit <- lm( SalePrice~., data=numeric_2)
alias(numeric_fit)
## Model :
## SalePrice ~ MSSubClass + LotFrontage + LotArea + OverallQual +
## OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 +
## BsmtFinSF2 + BsmtUnfSF + X2ndFlrSF + LowQualFinSF + BsmtFullBath +
## BsmtHalfBath + FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr +
## TotRmsAbvGrd + Fireplaces + GarageYrBlt + GarageArea + WoodDeckSF +
## OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch +
## PoolArea + MiscVal + MoSold + YrSold
vif(numeric_fit)
## MSSubClass LotFrontage LotArea OverallQual OverallCond
## 1.722172 1.789459 1.275532 3.187443 1.566937
## YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2
## 5.519956 2.376252 1.390745 3.922340 1.367020
## BsmtUnfSF X2ndFlrSF LowQualFinSF BsmtFullBath BsmtHalfBath
## 3.123205 4.155479 1.121306 2.211703 1.150635
## FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd
## 2.739210 2.154137 2.323749 1.584425 4.073303
## Fireplaces GarageYrBlt GarageArea WoodDeckSF OpenPorchSF
## 1.496814 4.653908 2.168518 1.206311 1.209374
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal
## 1.284538 1.022178 1.108475 1.097447 1.020406
## MoSold YrSold
## 1.049460 1.050457
plot(numeric_fit)
summary(numeric_fit)
##
## Call:
## lm(formula = SalePrice ~ ., data = numeric_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -473869 -17657 -2977 13553 333242
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.806e+05 1.452e+06 0.538 0.590862
## MSSubClass -1.809e+02 2.900e+01 -6.239 5.80e-10 ***
## LotFrontage -2.564e+01 5.458e+01 -0.470 0.638598
## LotArea 4.893e-01 1.058e-01 4.626 4.07e-06 ***
## OverallQual 1.881e+04 1.207e+03 15.582 < 2e-16 ***
## OverallCond 3.627e+03 1.052e+03 3.449 0.000579 ***
## YearBuilt 2.700e+02 7.272e+01 3.713 0.000213 ***
## YearRemodAdd 2.259e+02 6.980e+01 3.236 0.001239 **
## MasVnrArea 3.666e+01 6.095e+00 6.015 2.28e-09 ***
## BsmtFinSF1 3.758e+01 4.059e+00 9.258 < 2e-16 ***
## BsmtFinSF2 2.730e+01 6.776e+00 4.029 5.89e-05 ***
## BsmtUnfSF 2.583e+01 3.739e+00 6.907 7.41e-12 ***
## X2ndFlrSF 2.633e+01 4.366e+00 6.032 2.06e-09 ***
## LowQualFinSF 1.736e+01 2.036e+01 0.853 0.393863
## BsmtFullBath 8.360e+03 2.679e+03 3.120 0.001844 **
## BsmtHalfBath 1.542e+03 4.200e+03 0.367 0.713635
## FullBath 1.040e+04 2.809e+03 3.704 0.000220 ***
## HalfBath -6.135e+02 2.729e+03 -0.225 0.822118
## BedroomAbvGr -1.068e+04 1.747e+03 -6.115 1.24e-09 ***
## KitchenAbvGr -8.565e+03 5.341e+03 -1.604 0.108983
## TotRmsAbvGrd 9.593e+03 1.161e+03 8.264 3.18e-16 ***
## Fireplaces 7.407e+03 1.774e+03 4.175 3.17e-05 ***
## GarageYrBlt -5.961e+01 8.113e+01 -0.735 0.462629
## GarageArea 3.757e+01 6.439e+00 5.835 6.66e-09 ***
## WoodDeckSF 3.149e+01 8.192e+00 3.843 0.000127 ***
## OpenPorchSF 1.612e-01 1.552e+01 0.010 0.991711
## EnclosedPorch 1.360e+01 1.734e+01 0.784 0.432952
## X3SsnPorch 3.436e+01 3.224e+01 1.066 0.286711
## ScreenPorch 6.015e+01 1.765e+01 3.408 0.000674 ***
## PoolArea -1.677e+01 2.438e+01 -0.688 0.491546
## MiscVal -1.535e+00 1.903e+00 -0.806 0.420102
## MoSold 4.335e+00 3.542e+02 0.012 0.990238
## YrSold -8.493e+02 7.215e+02 -1.177 0.239305
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35710 on 1427 degrees of freedom
## Multiple R-squared: 0.8024, Adjusted R-squared: 0.7979
## F-statistic: 181.1 on 32 and 1427 DF, p-value: < 2.2e-16
n <- nrow(numeric_2)
sample <- sample(seq(n), size = n * 0.7, replace = FALSE)
train <- numeric_2[sample,]
test <- numeric_2[-sample,]
mdlY <- as.matrix(train["SalePrice"])
#train <- subset(train,select = -c(SalePrice1))
mdlX <- model.matrix(~., data=train[,1:32])[,-1]
colnames(numeric_2)
## [1] "MSSubClass" "LotFrontage" "LotArea" "OverallQual"
## [5] "OverallCond" "YearBuilt" "YearRemodAdd" "MasVnrArea"
## [9] "BsmtFinSF1" "BsmtFinSF2" "BsmtUnfSF" "X2ndFlrSF"
## [13] "LowQualFinSF" "BsmtFullBath" "BsmtHalfBath" "FullBath"
## [17] "HalfBath" "BedroomAbvGr" "KitchenAbvGr" "TotRmsAbvGrd"
## [21] "Fireplaces" "GarageYrBlt" "GarageArea" "WoodDeckSF"
## [25] "OpenPorchSF" "EnclosedPorch" "X3SsnPorch" "ScreenPorch"
## [29] "PoolArea" "MiscVal" "MoSold" "YrSold"
## [33] "SalePrice"
y.test <- as.matrix(test["SalePrice"])
x.test <- model.matrix(~., data=test[,1:32])[,-1]
fit.lasso <- glmnet(mdlX, mdlY, family="gaussian", alpha=1)
fit.ridge <- glmnet(mdlX, mdlY, family="gaussian", alpha=0)
fit.elnet <- glmnet(mdlX, mdlY, family="gaussian", alpha=.5)
for (i in 0:10) {
assign(paste("fit", i, sep=""), cv.glmnet(mdlX, mdlY, type.measure="mse",
alpha=i/10,family="gaussian"))
}
par(mfrow=c(3,2))
# For plotting options, type '?plot.glmnet' in R console
plot(fit.lasso, xvar="lambda")
plot(fit10, main="LASSO")
plot(fit.ridge, xvar="lambda")
plot(fit0, main="Ridge")
plot(fit.elnet, xvar="lambda")
plot(fit5, main="Elastic Net")
yhat0 <- predict(fit0, s=fit0$lambda.1se, newx=x.test)
yhat1 <- predict(fit1, s=fit1$lambda.1se, newx=x.test)
yhat2 <- predict(fit2, s=fit2$lambda.1se, newx=x.test)
yhat3 <- predict(fit3, s=fit3$lambda.1se, newx=x.test)
yhat4 <- predict(fit4, s=fit4$lambda.1se, newx=x.test)
yhat5 <- predict(fit5, s=fit5$lambda.1se, newx=x.test)
yhat6 <- predict(fit6, s=fit6$lambda.1se, newx=x.test)
yhat7 <- predict(fit7, s=fit7$lambda.1se, newx=x.test)
yhat8 <- predict(fit8, s=fit8$lambda.1se, newx=x.test)
yhat9 <- predict(fit9, s=fit9$lambda.1se, newx=x.test)
yhat10 <- predict(fit10, s=fit10$lambda.1se, newx=x.test)
mse0 <- sqrt(mean((log(y.test) - log(yhat0))^2))
mse1 <- sqrt(mean((log(y.test) - log(yhat1))^2))
mse2 <- sqrt(mean((log(y.test) - log(yhat2))^2))
mse3 <- sqrt(mean((log(y.test) - log(yhat3))^2))
mse4 <- sqrt(mean((log(y.test) - log(yhat4))^2))
mse5 <- sqrt(mean((log(y.test) - log(yhat5))^2))
mse6 <- sqrt(mean((log(y.test) - log(yhat6))^2))
## Warning in log(yhat6): NaNs produced
mse7 <- sqrt(mean((log(y.test) - log(yhat7))^2))
## Warning in log(yhat7): NaNs produced
mse8 <- sqrt(mean((log(y.test) - log(yhat8))^2))
## Warning in log(yhat8): NaNs produced
mse9 <- sqrt(mean((log(y.test) - log(yhat9))^2))
## Warning in log(yhat9): NaNs produced
mse10 <-sqrt( mean((log(y.test) - log(yhat10))^2))
## Warning in log(yhat10): NaNs produced
as.data.frame(cbind(alpha= c(0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1), MSE=c(mse0,mse1,mse2,mse3,mse4,mse5,mse6,mse7,mse8,mse9,mse10)))
## alpha MSE
## 1 0.0 0.1846513
## 2 0.1 0.1859044
## 3 0.2 0.1922189
## 4 0.3 0.1995716
## 5 0.4 0.2074744
## 6 0.5 0.2196102
## 7 0.6 NaN
## 8 0.7 NaN
## 9 0.8 NaN
## 10 0.9 NaN
## 11 1.0 NaN
# library(glmulti)
# my_names <- colnames(hand_selected__df)[1:30]
# my_fit <- glmulti('SalePrice1',my_names,hand_selected__df,level=1)
## drop multicollnear col
# df_22 <- subset(df_22, select = -c(TotalBsmtSF))
# df_22 <- subset(df_22, select = -c(GrLivArea))
# df_model <- subset(df_model, select = -c(Electrical,Exterior2nd,BldgType))
# model 2 on whole dataset
# library(MASS)
# ind <- sapply(full_df, is.numeric)
# full_df[ind] <- lapply(full_df[ind], scale)
# ind <- sapply(hand_selected__df, is.numeric)
# hand_selected__df[ind] <- lapply(hand_selected__df[ind], scale)