Load libraries

library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(broom)
library(knitr)

Problem 1

N <- 6
X <- runif(10000, 1, N)               # X ~ Uniform(1, N)
Y <- rnorm(10000, (N+1)/2, (N+1)/2)   # Y ~ Normal with mean and sd both (N+1)/2

x <- median(X)                        # x = median of X
y <- unname(quantile(Y)[2])           # y = first quartile of Y

### a. The probability that X is greater than the median of X, given that X is greater than the first quartile of Y
# P(X>x | X>y) = P(X>x and X>y) / P(X>y); both events involve the same variable X, so the joint must be counted directly
round(mean(X>x & X>y)/mean(X>y), 3)
## [1] 0.514
### b. The probability that X is greater than the median of X and Y is greater than the first quartile of Y
mean(X>x)*mean(Y>y)  # X and Y are drawn independently, so the joint probability factors into the product of the marginals
## [1] 0.375
### c. The probability that X is less than the median of X, given that X is greater than the first quartile of Y
round(mean(X<x & X>y)/mean(X>y), 3)
## [1] 0.486
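# Cross-check against the closed-form values (a sketch; x and y are the sample quantiles
# from above, with x ~ 3.5 and y ~ 1.14, so x > y; the unseeded simulation should land close):
p_a <- punif(x, 1, N, lower.tail = FALSE) / punif(y, 1, N, lower.tail = FALSE)  # P(X>x)/P(X>y), since {X>x} lies inside {X>y}
p_b <- punif(x, 1, N, lower.tail = FALSE) * pnorm(y, (N+1)/2, (N+1)/2, lower.tail = FALSE)
p_c <- (punif(x, 1, N) - punif(y, 1, N)) / punif(y, 1, N, lower.tail = FALSE)
round(c(a = p_a, b = p_b, c = p_c), 3)  # roughly 0.514, 0.375, and 0.486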
### Two-Way Table:
df <- tibble(rowname = c("X>x", "X<=x"), 
             `Y>y`=c(mean(Y>y)*mean(X>x), mean(Y>y)*mean(X<=x)), 
             `Y<=y`=c(mean(Y<=y)*mean(X>x), mean(Y<=y)*mean(X<=x))) %>% 
  column_to_rownames()
df %>% kable()
        Y>y   Y<=y
X>x   0.375  0.125
X<=x  0.375  0.125
# The upper-left cell is 0.375, the joint probability from part (b).
# The marginal probability for the first row is 0.375 + 0.125 = 0.5 and for the first column is 0.375 + 0.375 = 0.75,
# and 0.5 * 0.75 = 0.375, consistent with P(A and B) = P(A) * P(B) for independent events.
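# As a sanity check, the same table can be tabulated directly from joint counts rather
# than from products of marginals (a sketch; since X and Y are independent, the two
# versions agree up to sampling error):
joint_counts <- table(X > x, Y > y, dnn = c("X>x", "Y>y"))
prop.table(joint_counts)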

### Independence tests
fisher.test(x = df %>% as.matrix()*10000)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  df %>% as.matrix() * 10000
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.9124854 1.0959080
## sample estimates:
## odds ratio 
##          1
chisq.test(x = df %>% as.matrix()*10000)  # with matrix input, chisq.test tests independence of rows and columns
## 
##  Pearson's Chi-squared test
## 
## data:  df %>% as.matrix() * 10000
## X-squared = 0, df = 1, p-value = 1
# Whichever test one prefers, the result is the same: both give a p-value of 1, indicating no evidence of an association. (In other words, there is no reason to reject the hypothesis that the events are independent.)
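# The tests can also be run on the directly tabulated counts from above, which avoids
# building the table under the very independence assumption being tested (a sketch;
# the p-values should again be large):
fisher.test(joint_counts)
chisq.test(joint_counts)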


Problem 2

Kaggle house prices competition

### Load data from Github
train_df <- read.csv('https://raw.githubusercontent.com/mehtablocker/cuny_605/master/data_files/train.csv')
test_df <- read.csv('https://raw.githubusercontent.com/mehtablocker/cuny_605/master/data_files/test.csv')
train_df %>% head() %>% kable()
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
1 60 RL 65 8450 Pave NA Reg Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196 Gd TA PConc Gd TA No GLQ 706 Unf 0 150 856 GasA Ex Y SBrkr 856 854 0 1710 1 0 2 1 3 1 Gd 8 Typ 0 NA Attchd 2003 RFn 2 548 TA TA Y 0 61 0 0 0 0 NA NA NA 0 2 2008 WD Normal 208500
2 20 RL 80 9600 Pave NA Reg Lvl AllPub FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0 TA TA CBlock Gd TA Gd ALQ 978 Unf 0 284 1262 GasA Ex Y SBrkr 1262 0 0 1262 0 1 2 0 3 1 TA 6 Typ 1 TA Attchd 1976 RFn 2 460 TA TA Y 298 0 0 0 0 0 NA NA NA 0 5 2007 WD Normal 181500
3 60 RL 68 11250 Pave NA IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162 Gd TA PConc Gd TA Mn GLQ 486 Unf 0 434 920 GasA Ex Y SBrkr 920 866 0 1786 1 0 2 1 3 1 Gd 6 Typ 1 TA Attchd 2001 RFn 2 608 TA TA Y 0 42 0 0 0 0 NA NA NA 0 9 2008 WD Normal 223500
4 70 RL 60 9550 Pave NA IR1 Lvl AllPub Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0 TA TA BrkTil TA Gd No ALQ 216 Unf 0 540 756 GasA Gd Y SBrkr 961 756 0 1717 1 0 1 0 3 1 Gd 7 Typ 1 Gd Detchd 1998 Unf 3 642 TA TA Y 0 35 272 0 0 0 NA NA NA 0 2 2006 WD Abnorml 140000
5 60 RL 84 14260 Pave NA IR1 Lvl AllPub FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350 Gd TA PConc Gd TA Av GLQ 655 Unf 0 490 1145 GasA Ex Y SBrkr 1145 1053 0 2198 1 0 2 1 4 1 Gd 9 Typ 1 TA Attchd 2000 RFn 3 836 TA TA Y 192 84 0 0 0 0 NA NA NA 0 12 2008 WD Normal 250000
6 50 RL 85 14115 Pave NA IR1 Lvl AllPub Inside Gtl Mitchel Norm Norm 1Fam 1.5Fin 5 5 1993 1995 Gable CompShg VinylSd VinylSd None 0 TA TA Wood Gd TA No GLQ 732 Unf 0 64 796 GasA Ex Y SBrkr 796 566 0 1362 1 0 1 1 1 1 TA 5 Typ 0 NA Attchd 1993 Unf 2 480 TA TA Y 40 30 0 320 0 0 NA MnPrv Shed 700 10 2009 WD Normal 143000
### Summary and histogram of target variable
train_df %>% select(SalePrice) %>% summary()
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000
train_df %>% ggplot(aes(SalePrice)) + geom_histogram(bins=20)
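# The mean (180921) exceeds the median (163000), suggesting right skew; a log-scale
# view of the same histogram makes the shape easier to judge (a quick sketch):
train_df %>% ggplot(aes(SalePrice)) + geom_histogram(bins=20) + scale_x_log10()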

### Scatterplot matrix of select variables
train_df %>% select(YrSold, GrLivArea, SalePrice) %>% plot()

### Correlation matrix of select variables
cor_mat <- train_df %>% select(YrSold, GrLivArea, SalePrice) %>% cor()
round(cor_mat, 2)
##           YrSold GrLivArea SalePrice
## YrSold      1.00     -0.04     -0.03
## GrLivArea  -0.04      1.00      0.71
## SalePrice  -0.03      0.71      1.00
### Confidence intervals for correlations
round(cor.test(train_df$GrLivArea, train_df$SalePrice, conf.level = 0.8)$conf.int[1:2], 2)
## [1] 0.69 0.72
round(cor.test(train_df$YrSold, train_df$SalePrice, conf.level = 0.8)$conf.int[1:2], 2)
## [1] -0.06  0.00
round(cor.test(train_df$YrSold, train_df$GrLivArea, conf.level = 0.8)$conf.int[1:2], 2)
## [1] -0.07  0.00
# Zero lies outside the interval for the GrLivArea/SalePrice pair, so that correlation is unlikely to be due to chance alone; for the other two pairs the intervals reach zero, so we cannot rule out that those correlations arose by chance.

# In general it is very important to be cautious of familywise error, i.e., finding spurious correlations, particularly when dealing with over 75 variables. The more variables there are, the higher the probability of finding correlations by chance. We can mitigate this risk by using domain knowledge for variable selection and/or applying sensible Bayesian priors.
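# One mechanical guard (not used above) is to adjust the p-values for the number of
# tests performed, e.g. with Holm's method (a sketch using the three pairs from above):
p_vals <- c(GrLivArea_SalePrice = cor.test(train_df$GrLivArea, train_df$SalePrice)$p.value,
            YrSold_SalePrice = cor.test(train_df$YrSold, train_df$SalePrice)$p.value,
            YrSold_GrLivArea = cor.test(train_df$YrSold, train_df$GrLivArea)$p.value)
p.adjust(p_vals, method = "holm")  # familywise-error-adjusted p-values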

### Linear Algebra
prec_mat <- solve(cor_mat)
prec_mat
##                YrSold   GrLivArea    SalePrice
## YrSold    1.001354525  0.03224312  0.006113496
## GrLivArea 0.032243122  2.00966991 -1.423168733
## SalePrice 0.006113496 -1.42316873  2.008669018
round(cor_mat %*% prec_mat, 2)
##           YrSold GrLivArea SalePrice
## YrSold         1         0         0
## GrLivArea      0         1         0
## SalePrice      0         0         1
round(prec_mat %*% cor_mat, 2)
##           YrSold GrLivArea SalePrice
## YrSold         1         0         0
## GrLivArea      0         1         0
## SalePrice      0         0         1
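# As an aside, the off-diagonal entries of the precision matrix encode partial
# correlations via the standard identity -P[i,j]/sqrt(P[i,i]*P[j,j]) (a sketch):
-prec_mat["GrLivArea", "SalePrice"] /
  sqrt(prec_mat["GrLivArea", "GrLivArea"] * prec_mat["SalePrice", "SalePrice"])
# i.e., GrLivArea and SalePrice remain strongly correlated even after controlling for YrSold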
LU_decomp <- function(A){
  
  ### Check that the matrix is square and full rank, with no zeros on the initial diagonal (pivots)
  if(any(diag(A)==0) | qr(A)$rank<max(nrow(A), ncol(A))) {
    message("Error: This matrix cannot be decomposed.")
  } else {
    U <- A
    # Get U by going column by column and algebraically putting zeros in all positions below the diagonal
    for (j in 1:(ncol(U)-1)){
      for (i in (j+1):nrow(U)){
        if (U[i,j]!=0){U[i,] <- U[i, ] - U[j, ]*(U[i,j]/U[j,j])}
      }
    }
    
    # Get L by restating L U = A  as  U_inv L_inv = A_inv, solving for L_inv, and then taking the inverse to get L
    U_inv <- solve(U)
    A_inv <- solve(A)
    L_inv <- solve(U_inv, A_inv)
    L <- solve(L_inv)
    list(L=round(L,2) , U=round(U,2))
  }
}

LU_decomp(cor_mat)
## $L
##           YrSold GrLivArea SalePrice
## YrSold      1.00      0.00         0
## GrLivArea  -0.04      1.00         0
## SalePrice  -0.03      0.71         1
## 
## $U
##           YrSold GrLivArea SalePrice
## YrSold         1     -0.04     -0.03
## GrLivArea      0      1.00      0.71
## SalePrice      0      0.00      0.50
LU_decomp(prec_mat)
## $L
##           YrSold GrLivArea SalePrice
## YrSold      1.00      0.00         0
## GrLivArea   0.03      1.00         0
## SalePrice   0.01     -0.71         1
## 
## $U
##           YrSold GrLivArea SalePrice
## YrSold         1      0.03      0.01
## GrLivArea      0      2.01     -1.42
## SalePrice      0      0.00      1.00
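# Quick verification (a sketch): multiplying the factors back together should recover
# the original matrix, up to the rounding applied inside the function
lu_cor <- LU_decomp(cor_mat)
round(lu_cor$L %*% lu_cor$U - cor_mat, 1)  # approximately the zero matrix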
### Check to see which variables are right skewed
num_inds <- which(sapply(train_df, class)!="factor") #indices of the numerical variables
train_df %>% 
  select(num_inds) %>% 
  gather(variable, value, -Id) %>%  #keep Id out of the gathered columns so it does not get its own facet
  filter(!is.na(value)) %>% 
  ggplot(aes(x=value)) + geom_histogram(bins=20) + facet_wrap(~variable, scales = 'free_x') #histograms

### Fit exponential distribution to OpenPorchSF
lam <- MASS::fitdistr(train_df$OpenPorchSF, "exponential")$estimate %>% unname()
sim_vals <- rexp(nrow(train_df), lam)
train_df %>% 
  mutate(sim_OPSF = sim_vals) %>% 
  select(Id, sim_OPSF, OpenPorchSF) %>% 
  gather(variable, value, -Id) %>% 
  filter(!is.na(value)) %>% 
  ggplot(aes(x=value)) + geom_histogram(bins=20) + facet_wrap(~variable, scales = 'free_x') #compare histograms

### Theoretical 5th and 95th percentiles
qexp(p = c(0.05, 0.95), rate = lam)
## [1]   2.393359 139.781689
### Empirical percentiles
quantile(train_df$OpenPorchSF, c(0.05, 0.95))
##     5%    95% 
##   0.00 175.05
# The fit is clearly not perfect: the data have a point mass at zero (the empirical 5th percentile is 0, since many houses have no open porch) and a heavier right tail than the exponential implies.
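# Comparing a few more quantiles makes the mismatch concrete (a sketch):
probs <- c(0.05, 0.25, 0.5, 0.75, 0.95)
rbind(theoretical = qexp(probs, rate = lam),
      empirical = quantile(train_df$OpenPorchSF, probs))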

### 95% confidence interval for the mean, using the asymptotic CI based on Fisher information
# For an exponential with mean mu, the Fisher information gives Var(mu_hat) = mu^2/n, so se = xbar/sqrt(n)
xbar <- mean(train_df$OpenPorchSF)
z <- 1.96
n <- length(train_df$OpenPorchSF)
se <- xbar/sqrt(n)
c(xbar-z*se, xbar+z*se)
## [1] 44.26681 49.05374
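# An exact interval is also available here: for i.i.d. exponential data, 2*sum(x)/mu
# follows a chi-square distribution with 2n degrees of freedom (a sketch, assuming
# the exponential model actually holds):
total <- sum(train_df$OpenPorchSF)
c(2*total/qchisq(0.975, df = 2*n), 2*total/qchisq(0.025, df = 2*n))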
### Multiple regression model
lm_fit <- lm(SalePrice ~ LotArea + Neighborhood + BldgType + OverallCond + GrLivArea + YrSold, data=train_df)
summary(lm_fit)
## 
## Call:
## lm(formula = SalePrice ~ LotArea + Neighborhood + BldgType + 
##     OverallCond + GrLivArea + YrSold, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -320405  -19112   -1042   15121  275473 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          6.863e+05  1.550e+06   0.443 0.658066    
## LotArea              5.387e-01  1.163e-01   4.631 3.96e-06 ***
## NeighborhoodBlueste -4.555e+04  2.922e+04  -1.559 0.119310    
## NeighborhoodBrDale  -4.976e+04  1.468e+04  -3.390 0.000718 ***
## NeighborhoodBrkSide -8.464e+04  1.170e+04  -7.233 7.70e-13 ***
## NeighborhoodClearCr -5.075e+04  1.298e+04  -3.909 9.70e-05 ***
## NeighborhoodCollgCr -2.638e+04  1.078e+04  -2.447 0.014516 *  
## NeighborhoodCrawfor -4.603e+04  1.172e+04  -3.929 8.95e-05 ***
## NeighborhoodEdwards -8.317e+04  1.108e+04  -7.507 1.06e-13 ***
## NeighborhoodGilbert -4.502e+04  1.130e+04  -3.984 7.11e-05 ***
## NeighborhoodIDOTRR  -9.986e+04  1.231e+04  -8.110 1.08e-15 ***
## NeighborhoodMeadowV -5.926e+04  1.368e+04  -4.330 1.59e-05 ***
## NeighborhoodMitchel -4.997e+04  1.177e+04  -4.245 2.33e-05 ***
## NeighborhoodNAmes   -6.889e+04  1.080e+04  -6.381 2.38e-10 ***
## NeighborhoodNoRidge  3.041e+04  1.216e+04   2.501 0.012485 *  
## NeighborhoodNPkVill -3.220e+04  1.620e+04  -1.988 0.047001 *  
## NeighborhoodNridgHt  6.805e+04  1.095e+04   6.214 6.75e-10 ***
## NeighborhoodNWAmes  -6.052e+04  1.138e+04  -5.317 1.22e-07 ***
## NeighborhoodOldTown -1.010e+05  1.115e+04  -9.061  < 2e-16 ***
## NeighborhoodSawyer  -7.100e+04  1.146e+04  -6.197 7.53e-10 ***
## NeighborhoodSawyerW -4.202e+04  1.139e+04  -3.690 0.000233 ***
## NeighborhoodSomerst  3.370e+03  1.082e+04   0.312 0.755449    
## NeighborhoodStoneBr  6.733e+04  1.240e+04   5.430 6.63e-08 ***
## NeighborhoodSWISU   -1.083e+05  1.303e+04  -8.314  < 2e-16 ***
## NeighborhoodTimber  -9.946e+03  1.228e+04  -0.810 0.418217    
## NeighborhoodVeenker  4.321e+03  1.540e+04   0.281 0.779113    
## BldgType2fmCon      -2.221e+04  7.312e+03  -3.037 0.002432 ** 
## BldgTypeDuplex      -2.964e+04  5.703e+03  -5.198 2.31e-07 ***
## BldgTypeTwnhs       -5.041e+04  7.738e+03  -6.515 1.01e-10 ***
## BldgTypeTwnhsE      -2.363e+04  4.897e+03  -4.825 1.55e-06 ***
## OverallCond          6.870e+03  1.014e+03   6.778 1.78e-11 ***
## GrLivArea            7.452e+01  2.372e+00  31.412  < 2e-16 ***
## YrSold              -3.047e+02  7.723e+02  -0.395 0.693201    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38730 on 1427 degrees of freedom
## Multiple R-squared:  0.7675, Adjusted R-squared:  0.7623 
## F-statistic: 147.2 on 32 and 1427 DF,  p-value: < 2.2e-16
# Check residuals
lm_df <- augment(lm_fit)
    
# residual plot
lm_df %>% 
  ggplot(aes(x = .fitted, y = .resid)) + 
  geom_point() + 
  geom_hline(yintercept = 0)

# residual histogram
lm_df %>% 
  ggplot(aes(.resid)) + 
  geom_histogram(bins = 10)

# residual qqplot
lm_df %>% 
  ggplot(aes(sample = .resid)) + 
  stat_qq() + 
  stat_qq_line()
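
# A formal complement to the plots (a sketch): Shapiro-Wilk test of normality on the residuals
shapiro.test(lm_df$.resid)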

# Make predictions on test data
yhats <- predict(lm_fit, newdata = test_df)
yhat_df <- data.frame(Id = test_df$Id, SalePrice = yhats %>% unname())
yhat_df %>% head() %>% kable()
Id SalePrice
1461 119102.6
1462 152796.3
1463 191923.8
1464 194855.7
1465 249880.3
1466 191798.4
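
For reference, a Kaggle submission file can be written out with something like the following (a sketch; the file name is arbitrary):

write.csv(yhat_df, "submission.csv", row.names = FALSE)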


The predictions were entered into the competition under “mehtablocker” and received a score of 0.20281.