library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ stringr 1.4.0
## ✓ tidyr 1.1.4 ✓ forcats 0.5.1
## ✓ readr 2.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidymodels)
## Registered S3 method overwritten by 'tune':
## method from
## required_pkgs.model_spec parsnip
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.4 ──
## ✓ broom 0.7.9 ✓ rsample 0.1.1
## ✓ dials 0.0.10 ✓ tune 0.1.6
## ✓ infer 1.0.0 ✓ workflows 0.2.4
## ✓ modeldata 0.1.1 ✓ workflowsets 0.1.0
## ✓ parsnip 0.1.7 ✓ yardstick 0.0.9
## ✓ recipes 0.1.17
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
training<- read_csv("./house-prices-advanced-regression-techniques/train.csv")
## Rows: 1460 Columns: 81
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (38): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
testing<- read_csv("./house-prices-advanced-regression-techniques/test.csv")
## Rows: 1459 Columns: 80
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (37): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
combined_data <- bind_rows(training,testing)
data_view<- DataExplorer::plot_missing(training, missing_only = T)
cols_to_remove <- data_view$data %>% filter(Band %in% c("Remove", "Bad",'Good','OK')) %>% pull(feature)
bad_data <- data.frame(cols_to_remove)
training <- training%>% dplyr::select(-c(bad_data$cols_to_remove))
testing <- testing%>% dplyr::select(-c(bad_data$cols_to_remove))
combined_data <- combined_data %>% dplyr::select(-c(bad_data$cols_to_remove))
ggplot(training, aes(x=SalePrice)) +
geom_histogram(aes(y=..density..), colour="black", fill="white",bins=50)+
geom_density(alpha=.2, fill="purple")+
labs(x = "", y = "")
ggplot(training, aes(sample = SalePrice))+
stat_qq()+
stat_qq_line()+
labs(x = "", y = "")
### Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset.
training %>% ggpairs(columns = c('GrLivArea','TotalBsmtSF','SalePrice'),
upper = list(continuous = wrap('cor',size=8)))
### Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval.
cor.test(training$TotalBsmtSF,training$GrLivArea,method = "pearson",conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: training$TotalBsmtSF and training$GrLivArea
## t = 19.503, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4278380 0.4810855
## sample estimates:
## cor
## 0.4548682
cor.test(training$GrLivArea,training$SalePrice,method = "pearson",conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: training$GrLivArea and training$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6915087 0.7249450
## sample estimates:
## cor
## 0.7086245
cor.test(training$TotalBsmtSF,training$SalePrice,method = "pearson",conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: training$TotalBsmtSF and training$SalePrice
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5922142 0.6340846
## sample estimates:
## cor
## 0.6135806
Basically what the correlation stats are telling us is that there is a relationship between square footage and sale price. For every square foot of basement the price increases by $61. While for every square foot of above ground living area the sale price increases by $71.
The square footage of the basement and above ground living area has a correlation of .45. Which is a combination of all floor levels above the ground. This makes sense because basements usually have a similar area as the first floor. While some houses may have two above ground floors, and some might not have a basement at all nor a second level, the correlation of .45 being roughly half of the total above ground living area total makes perfect sense.
A family wise error is always of concern regardless of the statistics and correlation because that is just the nature of statistics. There is always a possibility of having a type I or type II error for any test that is ran.
three_df <- data.frame(training$TotalBsmtSF,training$GrLivArea,training$SalePrice)
cor_1 <- cor(three_df)
precision_cor <- solve(cor_1)
precision_cor
## training.TotalBsmtSF training.GrLivArea training.SalePrice
## training.TotalBsmtSF 1.60588442 -0.06473842 -0.9394642
## training.GrLivArea -0.06473842 2.01124151 -1.3854927
## training.SalePrice -0.93946422 -1.38549273 2.5582310
cor_1_prec <- round(cor_1 %*% precision_cor)
cor_1_prec
## training.TotalBsmtSF training.GrLivArea training.SalePrice
## training.TotalBsmtSF 1 0 0
## training.GrLivArea 0 1 0
## training.SalePrice 0 0 1
prec_cor_1 <- round(precision_cor%*%cor_1)
prec_cor_1
## training.TotalBsmtSF training.GrLivArea training.SalePrice
## training.TotalBsmtSF 1 0 0
## training.GrLivArea 0 1 0
## training.SalePrice 0 0 1
library(matrixcalc)
lu_decomp <- lu.decomposition(cor_1)
print('Lower')
## [1] "Lower"
lu_decomp$L
## [,1] [,2] [,3]
## [1,] 1.0000000 0.0000000 0
## [2,] 0.4548682 1.0000000 0
## [3,] 0.6135806 0.5415823 1
print('Upper')
## [1] "Upper"
lu_decomp$U
## [,1] [,2] [,3]
## [1,] 1 0.4548682 0.6135806
## [2,] 0 0.7930949 0.4295262
## [3,] 0 0.0000000 0.3908951
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
ggplot(training,aes(x=TotalBsmtSF)) +
geom_histogram(aes(y=..density..),fill='darkgreen',bins=40)+
geom_density()
### Minimum value is absolutely above zero.
## [1] "Minimum Value 0"
tbsmt_exp_dist <- fitdistr(training$TotalBsmtSF,'exponential')
tbsmt_lamb <- tbsmt_exp_dist$estimate
tbsmt_lamb
## rate
## 0.0009456896
tbsmt_sample <- rexp(1000,tbsmt_lamb)
tbsmt_df <- data.frame(tbsmt_sample)
The original histogram of total basement square footage was roughly normal with a high skewedness to the right. Now the data is in exponential distribution form. Neither distribution is perfect for their respective form.
ggplot(tbsmt_df,aes(x=tbsmt_sample)) +
geom_histogram(aes(y=..density..),fill='blue',bins=40)+
geom_density()
### Using the exponential pdf, find the 5 th and 95 th percentiles using the cumulative distribution function (CDF).
round(qexp(c(0.05,0.95), tbsmt_lamb),3)
## [1] 54.239 3167.776
round(qnorm(c(0.025, 0.975), mean = mean(training$TotalBsmtSF), sd = sd(training$TotalBsmtSF)),3)
## [1] 197.583 1917.276
quantile(training$TotalBsmtSF, c(0.05, 0.95))
## 5% 95%
## 519.3 1753.0
training %>%
ggplot(aes(x = SalePrice)) +
geom_histogram()
### Try to normalize Sales Price
training %>%
ggplot(aes(x = log(SalePrice))) +
geom_histogram()
training$TotalLivArea <- training$TotalBsmtSF + training$GrLivArea
training <- training %>% dplyr::select(.,-c(Id,TotalBsmtSF,GrLivArea))%>%mutate_if(is.character, as.factor)
testing$TotalLivArea <- testing$TotalBsmtSF + testing$GrLivArea
testing <- testing %>% dplyr::select(.,-c(Id,TotalBsmtSF,GrLivArea))%>%mutate_if(is.character, as.factor)
combined_data$TotalLivArea <- combined_data$TotalBsmtSF + combined_data$GrLivArea
combined_data <- combined_data %>% dplyr::select(.,-c(Id,TotalBsmtSF,GrLivArea))%>%mutate_if(is.character, as.factor)
train_index <- seq_len(nrow(training))
sales_recipe <- combined_data %>%
recipe(SalePrice ~ .) %>%
step_log(SalePrice) %>%
step_impute_mode(all_nominal()) %>%
step_dummy(all_nominal()) %>%
step_impute_mean(all_predictors()) %>%
step_normalize(all_predictors()) %>%
prep(training = combined_data)
juiced_data <- juice(sales_recipe)
training2 <- juiced_data[train_index,]
testing2 <- juiced_data[-train_index,]
val_set <- validation_split(training2, prop = 0.8)
tune_model <- linear_reg(penalty = tune(), mixture = 1) %>%
set_engine("glmnet")
sales_grid <- grid_regular(penalty(), levels = 100)
tuneflow <- workflow() %>%
add_model(tune_model) %>%
add_formula(SalePrice ~ .)
tuning_result <- tuneflow %>%
tune_grid(val_set,
grid = sales_grid,
metrics = metric_set(rmse))
tuning_result %>%
collect_metrics()
best_tune <- tuning_result %>% select_best(metric = "rmse")
sale_regres = linear_reg(penalty=best_tune$penalty,mixture=.25) %>%
set_engine("glmnet") %>%
fit(SalePrice ~ ., data = training2)
There were more things that I would have liked to have tidied up a bit. I think if I could have combined more columns that I could improve the score of .13771.
sales_pred = predict(sale_regres, new_data = testing2)
Sales_predictions <- data.frame('Id'=test_index,'SalePrice' = exp(sales_pred$.pred))
Sales_predictions