set.seed(42)

X <- runif(10000, min = 1, max = 8)
Y <- rnorm(10000, mean = 4.5, sd = 4.5)
x <- median(X)
y <- quantile(Y, .25)

#A)

group <- X[X > y]       # draws satisfying the conditioning event X > y
numerator <- X[X > x]   # draws with X > x; a subset of the above because x > y

answer_a <- length(numerator)/length(group)
print(answer_a)
## [1] 0.5335041
# Given X > y, there is about a 53% chance that X is also greater than x
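As a sanity check, the same conditional probability can be estimated by restricting the sample to the conditioning event first; a minimal sketch (the object names are illustrative):

cond_subset <- X[X > y]            # keep only the draws where the condition X > y holds
cond_prob <- mean(cond_subset > x) # proportion of those draws that also exceed x
# cond_prob should agree with answer_a, since x > y makes {X > x} a subset of {X > y}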

#B)

answer_b <- .5 * .75
print(answer_b)
## [1] 0.375
# Since X and Y are independent with P(X > x) = .5 and P(Y > y) = .75, there is a 37.5% chance that X is greater than x and Y is greater than y

#C)

answer_c <- 1 - answer_a
print(answer_c)
## [1] 0.4664959
# Given X > y, there is about a 47% chance that X is instead less than x (the complement of part A).
library(tidyverse)
## -- Attaching packages ------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   0.8.5
## v tidyr   1.0.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ---------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
rand_df <- data.frame(Xvar = X, Yvar = Y)
rand_df$X_over <- rand_df$Xvar > x
rand_df$Y_over <- rand_df$Yvar > y

rand_table <- table(rand_df$X_over, rand_df$Y_over)

X_prob <- sum(rand_table[2, ])/sum(rand_table)
Y_prob <- sum(rand_table[ ,2])/sum(rand_table)
joint_prob <- rand_table[2, 2]/sum(rand_table)
X_Y_prob <- X_prob * Y_prob

print(joint_prob)
## [1] 0.3732
print(X_Y_prob)
## [1] 0.375
# The joint probability and the product of the marginal probabilities are nearly identical, differing by less than .002. This is consistent with independence, which makes sense because X and Y were simulated independently.
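The joint and marginal proportions can also be read off in one step from the contingency table; a minimal sketch:

# Convert the 2x2 table of counts into proportions; the cell proportions are the joint
# probabilities and the row/column sums are the marginal probabilities
print(prop.table(rand_table))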
fisher_test <- fisher.test(rand_table)
chisq_test <- chisq.test(rand_table)

print(fisher_test$p.value)
## [1] 0.4189266
print(chisq_test$p.value)
## [1] 0.4189234

The main difference between Fisher's Exact Test and the chi-square test is that Fisher's test is designed for small sample sizes, whereas the chi-square test relies on a large-sample approximation and requires an expected count of at least 5 in every cell.

The more appropriate test here is the chi-square test, given the large sample size and that every cell easily exceeds the minimum expected count of 5.

Both tests give a p-value of about .42, so we fail to reject the null hypothesis that the two indicator variables are independent.
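To confirm that the chi-square assumption about cell counts holds, the expected counts under independence can be pulled straight from the test object; a minimal sketch:

# Expected cell counts under independence; with n = 10000 and marginals near .5 and .75,
# every expected count is in the thousands, far above the minimum of 5
print(chisq_test$expected)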

train1 <- read.csv('data/house-prices-advanced-regression-techniques/train.csv')

train1 %>% ggplot(aes(x = SalePrice)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

train1 %>% ggplot(aes(x = LotArea)) + geom_boxplot()

train1 %>% ggplot(aes(x = GarageArea)) + geom_boxplot()

train_3 <- train1 %>% select(c(LotArea, GarageArea, SalePrice))

print(summary(train_3))
##     LotArea         GarageArea       SalePrice     
##  Min.   :  1300   Min.   :   0.0   Min.   : 34900  
##  1st Qu.:  7554   1st Qu.: 334.5   1st Qu.:129975  
##  Median :  9478   Median : 480.0   Median :163000  
##  Mean   : 10517   Mean   : 473.0   Mean   :180921  
##  3rd Qu.: 11602   3rd Qu.: 576.0   3rd Qu.:214000  
##  Max.   :215245   Max.   :1418.0   Max.   :755000
corr_matrix <- cor(train_3)
pairs(train_3)
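Since corr_matrix is computed but never displayed, printing it makes the pairwise relationships explicit; a minimal sketch:

# Pairwise Pearson correlations among LotArea, GarageArea, and SalePrice
print(round(corr_matrix, 3))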

# Widen each confidence level above 80% to account for running three tests at once
# (note: the exact Bonferroni level for familywise 80% coverage would be 1 - .2/3)
cor_1 <- cor.test(train_3$LotArea, train_3$GarageArea, conf.level = .8 + .2/3)
cor_2 <- cor.test(train_3$LotArea, train_3$SalePrice, conf.level = .8 + .2/3)
cor_3 <- cor.test(train_3$GarageArea, train_3$SalePrice, conf.level = .8 + .2/3)

print(c(cor_1$conf.int, cor_2$conf.int, cor_3$conf.int))
## [1] 0.1421050 0.2181612 0.2268909 0.3000372 0.5987992 0.6468854

Since none of the confidence intervals contain 0, all three correlation coefficients are statistically significant, which means LotArea and GarageArea are each linearly associated with SalePrice. Familywise error is less of a concern because each confidence level was widened to account for the three simultaneous tests (a Bonferroni-style adjustment).
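An equivalent way to control familywise error is to adjust the p-values of the three tests rather than widening the confidence levels; a minimal sketch using the test objects above:

# Bonferroni-adjusted p-values for the three pairwise correlation tests
p_raw <- c(cor_1$p.value, cor_2$p.value, cor_3$p.value)
print(p.adjust(p_raw, method = 'bonferroni'))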

library(matlib)
## Warning: package 'matlib' was built under R version 3.6.3
library(matrixcalc)
## 
## Attaching package: 'matrixcalc'
## The following object is masked from 'package:matlib':
## 
##     vec
precision <- inv(corr_matrix)   # the precision matrix is the inverse of the correlation matrix

print(precision %*% corr_matrix)
##           LotArea   GarageArea    SalePrice
## [1,] 1.000000e+00 5.244495e-09 5.718851e-09
## [2,] 2.588825e-09 1.000000e+00 4.888090e-12
## [3,] 4.697697e-09 1.224164e-09 1.000000e+00
print(corr_matrix %*% precision)
##                                                  
## LotArea    1.000000e+00 2.588825e-09 4.697697e-09
## GarageArea 5.244495e-09 1.000000e+00 1.224164e-09
## SalePrice  5.718851e-09 4.888090e-12 1.000000e+00
# Both multiplications yield the identity matrix (up to floating-point rounding error)

LU_decomp <- lu.decomposition(corr_matrix)
L <- LU_decomp$L
U <- LU_decomp$U

print(L)
##           [,1]      [,2] [,3]
## [1,] 1.0000000 0.0000000    0
## [2,] 0.1804028 1.0000000    0
## [3,] 0.2638434 0.5952044    1
print(U)
##      [,1]      [,2]      [,3]
## [1,]    1 0.1804028 0.2638434
## [2,]    0 0.9674548 0.5758334
## [3,]    0 0.0000000 0.5876481
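As a quick check on the factorization, multiplying L and U back together should reproduce the correlation matrix up to floating-point error; a minimal sketch:

# L %*% U should reconstruct corr_matrix (ignoring dimnames)
print(all.equal(L %*% U, corr_matrix, check.attributes = FALSE))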
# We will continue with LotArea, as it is related to the target variable and shows a pronounced right skew in the boxplot above.
library(MASS)
## Warning: package 'MASS' was built under R version 3.6.3
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
lotarea <- train1$LotArea
exp_distr <- fitdistr(lotarea, 'exponential')   # maximum-likelihood fit of an exponential distribution
gamma <- exp_distr$estimate                     # the fitted rate parameter (lambda)

exp_sample <- rexp(1000, rate = gamma)

lotarea <- lotarea %>% as.data.frame()
exp_sample <- exp_sample %>% as.data.frame()

lotarea %>% ggplot(aes(x = .)) + geom_histogram(binwidth = 1000)

exp_sample %>% ggplot(aes(x = .)) + geom_histogram(binwidth = 1000)

# The two distributions do not appear to match well: compared with the simulated exponential sample, the empirical distribution has fewer values near 0, a steeper drop-off from the peak, and a longer extreme right tail.
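A more direct visual comparison is to overlay the fitted exponential density on a density-scaled histogram of the empirical data; a minimal sketch reusing the fitted rate stored in gamma:

# Histogram of LotArea on the density scale with the fitted exponential density overlaid
lotarea %>%
  ggplot(aes(x = .)) +
  geom_histogram(aes(y = ..density..), binwidth = 1000) +
  stat_function(fun = dexp, args = list(rate = gamma), colour = 'red')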

emp_mean <- mean(lotarea$.)
emp_sd <- sd(lotarea$.)
emp_se <- emp_sd/sqrt(nrow(lotarea))
zscore <- qnorm(c(.025, .975))   # two-sided 95% normal quantiles
conf <- emp_mean + zscore * emp_se

print(conf)
## [1] 10004.84 11028.81
# The 95% confidence interval for the empirical mean is 10005 - 11029.
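The same interval can be cross-checked with t.test(), which uses t rather than normal quantiles but is nearly identical at this sample size; a minimal sketch:

# t-based 95% confidence interval for the mean of LotArea; should be very close to conf
print(t.test(lotarea$.)$conf.int)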

distr_5 <- qexp(.05, rate = gamma)
distr_95 <- qexp(.95, rate = gamma)

emp_5 <- quantile(lotarea$., .05)
emp_95 <- quantile(lotarea$., .95)

print(c(distr_5, distr_95))
## [1]   539.4428 31505.6013
print(c(emp_5, emp_95))
##       5%      95% 
##  3311.70 17401.15
# Comparing the theoretical and empirical quantiles confirms that the exponential is not a good fit: the empirical data has far fewer observations near 0 and far fewer observations in the extreme right tail than the fitted distribution implies.
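The comparison can be extended beyond two quantiles with a quantile-quantile plot against the fitted exponential; a minimal sketch:

# Q-Q plot of empirical LotArea quantiles against quantiles of the fitted exponential;
# points would fall on the reference line if the fit were good
qqplot(qexp(ppoints(length(lotarea$.)), rate = gamma), lotarea$.,
       xlab = 'Fitted exponential quantiles', ylab = 'Empirical LotArea quantiles')
abline(0, 1, col = 'red')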
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.3
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
train1 <- train1[ , colSums(is.na(train1)) == 0]   # drop columns that contain any missing values

features <- train1 %>% dplyr::select(-SalePrice)
feature_names <- names(features)

train1 <- train1 %>% dplyr::select(-Id)

model <- train(SalePrice ~ ., data = train1, method = 'glmnet',
               trControl = trainControl('cv', number = 5),
               preProcess = c('center', 'scale'),
               tuneLength = 10)

test1 <- read.csv('data/house-prices-advanced-regression-techniques/test.csv')
test_predict <- test1 %>% dplyr::select(all_of(feature_names))
test_na <- test_predict[rowSums(is.na(test_predict)) != 0, ]   # test rows with any missing feature
na_df <- data.frame('Id' = test_na$Id, 'SalePrice' = median(train1$SalePrice))   # impute the training median sale price for those rows

test_predict <- test_predict[rowSums(is.na(test_predict)) == 0, ]

test_predict1 <- test_predict %>% dplyr::select(-Id)

y_pred <- predict(model, test_predict1)

pred_df <- data.frame('Id' = test_predict$Id, 'SalePrice' = y_pred)
pred_df <- rbind(pred_df, na_df)
full_df <- pred_df[order(pred_df$Id), ]


write.csv(full_df, 'data/preds.csv', row.names = F)

In the code above I trained an elastic net linear regression model and had caret tune the hyperparameters with five-fold cross-validation. I also had caret center and scale the numeric predictors so that the regularization penalty is applied on a comparable scale across features.

Cross-validation selected alpha = 0.1 (much closer to ridge regression than lasso) and a lambda of about 4411 (indicating fairly strong regularization).
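To see exactly which tuning values cross-validation selected and which coefficients survive the penalty, the fitted caret object can be inspected; a minimal sketch (run after the train() call above):

# Selected alpha/lambda and the corresponding glmnet coefficients at that lambda
print(model$bestTune)
print(coef(model$finalModel, s = model$bestTune$lambda))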