library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ stringr 1.4.0
## ✓ tidyr   1.1.4     ✓ forcats 0.5.1
## ✓ readr   2.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidymodels)
## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.4 ──
## ✓ broom        0.7.9      ✓ rsample      0.1.1 
## ✓ dials        0.0.10     ✓ tune         0.1.6 
## ✓ infer        1.0.0      ✓ workflows    0.2.4 
## ✓ modeldata    0.1.1      ✓ workflowsets 0.1.0 
## ✓ parsnip      0.1.7      ✓ yardstick    0.0.9 
## ✓ recipes      0.1.17
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Discuss the meaning of your analysis. Would you be worried about familywise`error? Why or why not? 5 points

training<- read_csv("./house-prices-advanced-regression-techniques/train.csv")
## Rows: 1460 Columns: 81
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (38): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
testing<- read_csv("./house-prices-advanced-regression-techniques/test.csv")
## Rows: 1459 Columns: 80
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (37): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
combined_data <- bind_rows(training,testing)
data_view<- DataExplorer::plot_missing(training, missing_only = T)

cols_to_remove <- data_view$data %>% filter(Band %in% c("Remove", "Bad",'Good','OK')) %>% pull(feature)
bad_data <- data.frame(cols_to_remove)
training <- training%>% dplyr::select(-c(bad_data$cols_to_remove))
testing <- testing%>% dplyr::select(-c(bad_data$cols_to_remove))
combined_data <- combined_data %>% dplyr::select(-c(bad_data$cols_to_remove))
ggplot(training, aes(x=SalePrice)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white",bins=50)+
 geom_density(alpha=.2, fill="purple")+ 
  labs(x = "", y = "")

ggplot(training, aes(sample = SalePrice))+ 
  stat_qq()+
  stat_qq_line()+ 
  labs(x = "", y = "")

### Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset.

training %>% ggpairs(columns = c('GrLivArea','TotalBsmtSF','SalePrice'),
                     upper = list(continuous = wrap('cor',size=8)))

### Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval.

cor.test(training$TotalBsmtSF,training$GrLivArea,method = "pearson",conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  training$TotalBsmtSF and training$GrLivArea
## t = 19.503, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.4278380 0.4810855
## sample estimates:
##       cor 
## 0.4548682
cor.test(training$GrLivArea,training$SalePrice,method = "pearson",conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  training$GrLivArea and training$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6915087 0.7249450
## sample estimates:
##       cor 
## 0.7086245
cor.test(training$TotalBsmtSF,training$SalePrice,method = "pearson",conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  training$TotalBsmtSF and training$SalePrice
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5922142 0.6340846
## sample estimates:
##       cor 
## 0.6135806

Discuss the meaning of your analysis. Would you be worried about familywise`error? Why or why not? 5 points

Basically what the correlation stats are telling us is that there is a relationship between square footage and sale price. For every square foot of basement the price increases by $61. While for every square foot of above ground living area the sale price increases by $71.

The square footage of the basement and above ground living area has a correlation of .45. Which is a combination of all floor levels above the ground. This makes sense because basements usually have a similar area as the first floor. While some houses may have two above ground floors, and some might not have a basement at all nor a second level, the correlation of .45 being roughly half of the total above ground living area total makes perfect sense.

A family wise error is always of concern regardless of the statistics and correlation because that is just the nature of statistics. There is always a possibility of having a type I or type II error for any test that is ran.

Linear Algebra and Correlation. Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix. 5 points

three_df <- data.frame(training$TotalBsmtSF,training$GrLivArea,training$SalePrice)
cor_1 <- cor(three_df)
precision_cor <- solve(cor_1)
precision_cor
##                      training.TotalBsmtSF training.GrLivArea training.SalePrice
## training.TotalBsmtSF           1.60588442        -0.06473842         -0.9394642
## training.GrLivArea            -0.06473842         2.01124151         -1.3854927
## training.SalePrice            -0.93946422        -1.38549273          2.5582310
cor_1_prec <- round(cor_1 %*% precision_cor)
cor_1_prec
##                      training.TotalBsmtSF training.GrLivArea training.SalePrice
## training.TotalBsmtSF                    1                  0                  0
## training.GrLivArea                      0                  1                  0
## training.SalePrice                      0                  0                  1
prec_cor_1 <- round(precision_cor%*%cor_1)
prec_cor_1
##                      training.TotalBsmtSF training.GrLivArea training.SalePrice
## training.TotalBsmtSF                    1                  0                  0
## training.GrLivArea                      0                  1                  0
## training.SalePrice                      0                  0                  1

Conduct LU decomposition on the matrix

library(matrixcalc)
lu_decomp <- lu.decomposition(cor_1)
print('Lower')
## [1] "Lower"
lu_decomp$L
##           [,1]      [,2] [,3]
## [1,] 1.0000000 0.0000000    0
## [2,] 0.4548682 1.0000000    0
## [3,] 0.6135806 0.5415823    1
print('Upper')
## [1] "Upper"
lu_decomp$U
##      [,1]      [,2]      [,3]
## [1,]    1 0.4548682 0.6135806
## [2,]    0 0.7930949 0.4295262
## [3,]    0 0.0000000 0.3908951

Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R- devel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5 th and 95 th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss. 10 points

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select

Select a variable in the Kaggle.com training dataset that is skewed to the right

ggplot(training,aes(x=TotalBsmtSF)) + 
  geom_histogram(aes(y=..density..),fill='darkgreen',bins=40)+
  geom_density()

### Minimum value is absolutely above zero.

## [1] "Minimum Value 0"

Run fitdistr to fit an exponential probability density function

tbsmt_exp_dist <- fitdistr(training$TotalBsmtSF,'exponential')

Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value

tbsmt_lamb <- tbsmt_exp_dist$estimate
tbsmt_lamb
##         rate 
## 0.0009456896
tbsmt_sample <- rexp(1000,tbsmt_lamb)
tbsmt_df <- data.frame(tbsmt_sample)

Plot a histogram and compare it with a histogram of your original variable

The original histogram of total basement square footage was roughly normal with a high skewedness to the right. Now the data is in exponential distribution form. Neither distribution is perfect for their respective form.

ggplot(tbsmt_df,aes(x=tbsmt_sample)) + 
  geom_histogram(aes(y=..density..),fill='blue',bins=40)+
  geom_density()

### Using the exponential pdf, find the 5 th and 95 th percentiles using the cumulative distribution function (CDF).

round(qexp(c(0.05,0.95), tbsmt_lamb),3)
## [1]   54.239 3167.776

Also generate a 95% confidence interval from the empirical data, assuming normality.

round(qnorm(c(0.025, 0.975), mean = mean(training$TotalBsmtSF), sd = sd(training$TotalBsmtSF)),3)
## [1]  197.583 1917.276

Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

quantile(training$TotalBsmtSF, c(0.05, 0.95))
##     5%    95% 
##  519.3 1753.0
training %>% 
  ggplot(aes(x = SalePrice)) +
  geom_histogram()

### Try to normalize Sales Price

training %>% 
  ggplot(aes(x = log(SalePrice))) +
  geom_histogram()

Modeling.

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis.

training$TotalLivArea <- training$TotalBsmtSF + training$GrLivArea
training <- training %>% dplyr::select(.,-c(Id,TotalBsmtSF,GrLivArea))%>%mutate_if(is.character, as.factor)

testing$TotalLivArea <- testing$TotalBsmtSF + testing$GrLivArea
testing <- testing %>% dplyr::select(.,-c(Id,TotalBsmtSF,GrLivArea))%>%mutate_if(is.character, as.factor)

combined_data$TotalLivArea <- combined_data$TotalBsmtSF + combined_data$GrLivArea
combined_data <- combined_data %>% dplyr::select(.,-c(Id,TotalBsmtSF,GrLivArea))%>%mutate_if(is.character, as.factor)
train_index <- seq_len(nrow(training))

sales_recipe <- combined_data %>% 
  recipe(SalePrice ~ .) %>%
  step_log(SalePrice) %>% 
  step_impute_mode(all_nominal()) %>% 
  step_dummy(all_nominal()) %>% 
  step_impute_mean(all_predictors()) %>%
  step_normalize(all_predictors()) %>% 
  prep(training = combined_data)

juiced_data <- juice(sales_recipe)


training2 <- juiced_data[train_index,]
testing2 <- juiced_data[-train_index,]
val_set <- validation_split(training2, prop = 0.8)

tune_model <- linear_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet")

sales_grid <- grid_regular(penalty(), levels = 100)

tuneflow <- workflow() %>%
  add_model(tune_model) %>% 
  add_formula(SalePrice ~ .)

tuning_result <- tuneflow %>% 
  tune_grid(val_set,
            grid = sales_grid,
            metrics = metric_set(rmse))

tuning_result %>% 
  collect_metrics()
best_tune <- tuning_result %>% select_best(metric = "rmse")
sale_regres = linear_reg(penalty=best_tune$penalty,mixture=.25) %>% 
  set_engine("glmnet") %>% 
  fit(SalePrice ~ ., data = training2)

Final analysis

There were more things that I would have liked to have tidied up a bit. I think if I could have combined more columns that I could improve the score of .13771.

sales_pred = predict(sale_regres, new_data = testing2)
Sales_predictions <- data.frame('Id'=test_index,'SalePrice' = exp(sales_pred$.pred))
Sales_predictions