Problem 1
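(The code in this section assumes the following packages are attached; this is a minimal setup block inferred from the functions used below, since the original listing does not show one.)
library(tidyverse)  # tibble, dplyr, tidyr, and ggplot2 functions used throughout
library(knitr)      # kable()
library(broom)      # augment(), used for residual diagnostics in Problem 2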
N <- 6
X <- runif(10000, 1, N)                 # X ~ Uniform(1, N)
Y <- rnorm(10000, (N+1)/2, (N+1)/2)     # Y ~ Normal with mean and sd both (N+1)/2
x <- median(X)                          # median of X
y <- unname(quantile(Y)[2])             # first quartile of Y
### a. The probability that X is greater than the median of X, given that X is greater than the first quartile of Y
mean(X>x & X>y)/mean(X>y)
# Expect a value slightly above 0.5 (analytically about 0.51): conditioning on
# X > y trims part of X's lower tail, so P(X>x | X>y) = P(X>x)/P(X>y) > P(X>x).
### b. The probability that X is greater than the median of X and that Y is greater than the first quartile of Y
mean(X>x)*mean(Y>y)
## [1] 0.375
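Because X and Y are simulated independently, multiplying the two marginal probabilities is justified; as a check, the direct joint estimate agrees up to simulation noise:
mean(X>x & Y>y)   # direct estimate of the joint probability; expect ~0.375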
### c. The probability that X is less than the median of X, given that X is greater than the first quartile of Y
mean(X<x & X>y)/mean(X>y)
# Expect a value slightly below 0.5 (analytically about 0.49); parts (a) and (c)
# are complementary conditional probabilities and sum to 1.
### Two-Way Table:
# (Cell values are products of the marginal probabilities, which is valid here
# because X and Y are simulated independently.)
df <- tibble(rowname = c("X>x", "X<=x"),
             `Y>y` = c(mean(Y>y)*mean(X>x), mean(Y>y)*mean(X<=x)),
             `Y<=y` = c(mean(Y<=y)*mean(X>x), mean(Y<=y)*mean(X<=x))) %>%
  column_to_rownames()
df %>% kable()
|      |   Y>y |  Y<=y |
|:-----|------:|------:|
| X>x  | 0.375 | 0.125 |
| X<=x | 0.375 | 0.125 |
# The upper-left cell, 0.375, is the joint probability from part (b).
# The first-row marginal is 0.375 + 0.125 = 0.5 and the first-column marginal is
# 0.375 + 0.375 = 0.75, and 0.5 * 0.75 = 0.375 -- consistent with independence.
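As a quick numeric check on those marginals (a sketch using the df built above):
tab <- df %>% as.matrix()
rowSums(tab)                        # marginals for X: 0.50, 0.50
colSums(tab)                        # marginals for Y: 0.75, 0.25
rowSums(tab)[1] * colSums(tab)[1]   # 0.5 * 0.75 = 0.375, matching tab["X>x", "Y>y"]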
### Independence tests
fisher.test(x = df %>% as.matrix()*10000)
##
## Fisher's Exact Test for Count Data
##
## data: df %>% as.matrix() * 10000
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9124854 1.0959080
## sample estimates:
## odds ratio
## 1
chisq.test(x = df %>% as.matrix()*10000, correct = FALSE)  # matrix input runs a test of independence; correct = FALSE gives the uncorrected Pearson test shown below
##
## Pearson's Chi-squared test
##
## data: df %>% as.matrix() * 10000
## X-squared = 0, df = 1, p-value = 1
# Although the Fisher test may be more appropriate here (it is exact for a 2x2
# contingency table), the two tests agree: both give a p-value of 1, so there is
# no evidence of association -- that is, no reason to reject independence.
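The odds ratio of 1 is also easy to verify by hand as the sample cross-product ratio of the table (using the tab matrix from the check above; the factor of 10000 cancels):
(tab[1, 1] * tab[2, 2]) / (tab[1, 2] * tab[2, 1])   # (0.375*0.125)/(0.125*0.375) = 1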
Problem 2
Kaggle house prices competition
### Load data from GitHub
train_df <- read.csv('https://raw.githubusercontent.com/mehtablocker/cuny_605/master/data_files/train.csv')
test_df <- read.csv('https://raw.githubusercontent.com/mehtablocker/cuny_605/master/data_files/test.csv')
train_df %>% head() %>% kable()
(kable output: the first six rows of train_df -- 81 columns running from Id and MSSubClass through SaleCondition and SalePrice -- omitted here as too wide to reproduce)
### Summary and histogram of target variable
train_df %>% select(SalePrice) %>% summary()
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
train_df %>% ggplot(aes(SalePrice)) + geom_histogram(bins=20)
(figure: histogram of SalePrice)
### Scatterplot matrix of select variables
train_df %>% select(YrSold, GrLivArea, SalePrice) %>% plot()
(figure: pairwise scatterplots of YrSold, GrLivArea, and SalePrice)
### Correlation matrix of select variables
cor_mat <- train_df %>% select(YrSold, GrLivArea, SalePrice) %>% cor()
round(cor_mat, 2)
## YrSold GrLivArea SalePrice
## YrSold 1.00 -0.04 -0.03
## GrLivArea -0.04 1.00 0.71
## SalePrice -0.03 0.71 1.00
### Confidence intervals for correlations
round(cor.test(train_df$GrLivArea, train_df$SalePrice, conf.level = 0.8)$conf.int[1:2], 2)
## [1] 0.69 0.72
round(cor.test(train_df$YrSold, train_df$SalePrice, conf.level = 0.8)$conf.int[1:2], 2)
## [1] -0.06 0.00
round(cor.test(train_df$YrSold, train_df$GrLivArea, conf.level = 0.8)$conf.int[1:2], 2)
## [1] -0.07 0.00
# Zero lies outside the interval for the GrLivArea/SalePrice pair, so that
# correlation is unlikely to be due to chance alone; the intervals for the two
# YrSold pairs include zero, so we cannot rule out chance for those correlations.
# In general I think it is very important to be cautious of familywise error,
# i.e., finding spurious correlations, particularly when dealing with over 75
# variables: the more variables tested, the higher the probability of finding
# correlations by chance alone. We can mitigate this risk by using domain
# knowledge for variable selection and/or by applying sensible Bayesian priors.
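One standard frequentist mitigation, sketched here rather than applied above, is a Bonferroni-style correction: with three pairwise tests, run each at level alpha/3 (equivalently, widen each confidence interval):
# Bonferroni-adjusted interval for one of the three pairwise tests
# (alpha = 0.2 corresponds to the 80% intervals used above)
alpha <- 0.2
round(cor.test(train_df$YrSold, train_df$SalePrice,
               conf.level = 1 - alpha/3)$conf.int[1:2], 2)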
### Linear Algebra
prec_mat <- solve(cor_mat)  # the precision matrix is the inverse of the correlation matrix
prec_mat
## YrSold GrLivArea SalePrice
## YrSold 1.001354525 0.03224312 0.006113496
## GrLivArea 0.032243122 2.00966991 -1.423168733
## SalePrice 0.006113496 -1.42316873 2.008669018
round(cor_mat %*% prec_mat, 2)
## YrSold GrLivArea SalePrice
## YrSold 1 0 0
## GrLivArea 0 1 0
## SalePrice 0 0 1
round(prec_mat %*% cor_mat, 2)
## YrSold GrLivArea SalePrice
## YrSold 1 0 0
## GrLivArea 0 1 0
## SalePrice 0 0 1
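Beyond eyeballing the rounded output, the identity can be confirmed numerically:
# both products should equal the 3x3 identity up to floating-point error
all.equal(cor_mat %*% prec_mat, diag(3), check.attributes = FALSE)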
LU_decomp <- function(A){
  ### Check that the matrix is not singular, rectangular, or zero on a diagonal entry
  if(sum(diag(A)==0)>0 | qr(A)$rank<max(nrow(A), ncol(A))) {
    message("Error: This matrix cannot be decomposed.")
  } else {
    U <- A
    # Get U by going column by column and algebraically putting zeros in all
    # positions below the diagonal
    for (j in 1:(ncol(U)-1)){
      for (i in (j+1):nrow(U)){
        if (U[i,j]!=0){U[i, ] <- U[i, ] - U[j, ]*(U[i,j]/U[j,j])}
      }
    }
    # Get L by restating L U = A as U_inv L_inv = A_inv, solving for L_inv,
    # and then inverting to get L (equivalently: L <- A %*% solve(U))
    U_inv <- solve(U)
    A_inv <- solve(A)
    L_inv <- solve(U_inv, A_inv)
    L <- solve(L_inv)
    list(L=round(L,2), U=round(U,2))
  }
}
LU_decomp(cor_mat)
## $L
## YrSold GrLivArea SalePrice
## YrSold 1.00 0.00 0
## GrLivArea -0.04 1.00 0
## SalePrice -0.03 0.71 1
##
## $U
## YrSold GrLivArea SalePrice
## YrSold 1 -0.04 -0.03
## GrLivArea 0 1.00 0.71
## SalePrice 0 0.00 0.50
LU_decomp(prec_mat)
## $L
## YrSold GrLivArea SalePrice
## YrSold 1.00 0.00 0
## GrLivArea 0.03 1.00 0
## SalePrice 0.01 -0.71 1
##
## $U
## YrSold GrLivArea SalePrice
## YrSold 1 0.03 0.01
## GrLivArea 0 2.01 -1.42
## SalePrice 0 0.00 1.00
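As a sanity check (not in the original output), the factors should reproduce the input up to the rounding applied inside the function:
lu <- LU_decomp(cor_mat)
round(lu$L %*% lu$U - cor_mat, 2)   # expect a matrix of (near) zeros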
### Check to see which variables are right-skewed
num_inds <- which(sapply(1:ncol(train_df), function(x) class(train_df[, x]))!="factor")  # indices of the numeric (non-factor) columns
train_df %>%
  select(num_inds) %>%
  gather(Id) %>%
  filter(!is.na(value)) %>%
  ggplot(aes(x=value)) + geom_histogram(bins=20) + facet_wrap(~Id, scales = 'free_x')  # histograms
(figure: faceted histograms of each numeric variable)
### Fit exponential distribution to OpenPorchSF
lam <- MASS::fitdistr(train_df$OpenPorchSF, "exponential")$estimate %>% unname()
sim_vals <- rexp(nrow(train_df), lam)
train_df %>%
  mutate(sim_OPSF = sim_vals) %>%
  select(Id, sim_OPSF, OpenPorchSF) %>%
  gather(Id) %>%
  filter(!is.na(value)) %>%
  ggplot(aes(x=value)) + geom_histogram(bins=20) + facet_wrap(~Id, scales = 'free_x')  # compare histograms
(figure: histograms of simulated sim_OPSF vs. observed OpenPorchSF)
### Theoretical 5th and 95th percentiles
qexp(p = c(0.05, 0.95), rate = lam)
## [1] 2.393359 139.781689
### Empirical percentiles
quantile(train_df$OpenPorchSF, c(0.05, 0.95))
## 5% 95%
## 0.00 175.05
# The fit is clearly not perfect: the empirical distribution has a point mass at
# zero (its 5th percentile is 0) and fatter tails than the fitted exponential.
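The tail mismatch is easier to see in a quantile-quantile comparison against the fitted exponential (a quick sketch using base graphics):
# theoretical quantiles of the fitted exponential vs. empirical quantiles
qqplot(qexp(ppoints(nrow(train_df)), rate = lam), train_df$OpenPorchSF,
       xlab = "Fitted exponential quantiles", ylab = "Observed OpenPorchSF")
abline(0, 1)   # points above this line in the upper tail indicate fatter empirical tails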
### Confidence interval for the mean, using the asymptotic CI based on Fisher information
xbar <- mean(train_df$OpenPorchSF)
z <- 1.96
n <- length(train_df$OpenPorchSF)
se <- xbar/sqrt(n)  # for an exponential, sd = mean, so the standard error of the mean is xbar/sqrt(n)
c(xbar - z*se, xbar + z*se)
# With xbar of about 46.66 and n = 1460, this comes to roughly (44.27, 49.05)
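As a cross-check (an alternative not in the original analysis), exponential data also admit an exact interval for the mean, since 2*n*xbar/mu follows a chi-squared distribution with 2n degrees of freedom:
# exact 95% CI for the exponential mean via the chi-squared pivot
c(2*n*xbar / qchisq(0.975, df = 2*n),
  2*n*xbar / qchisq(0.025, df = 2*n))
# with n = 1460 this lands very close to the asymptotic interval above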
### Multiple regression model
lm_fit <- lm(SalePrice ~ LotArea + Neighborhood + BldgType + OverallCond + GrLivArea + YrSold, data=train_df)
summary(lm_fit)
##
## Call:
## lm(formula = SalePrice ~ LotArea + Neighborhood + BldgType +
## OverallCond + GrLivArea + YrSold, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -320405 -19112 -1042 15121 275473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.863e+05 1.550e+06 0.443 0.658066
## LotArea 5.387e-01 1.163e-01 4.631 3.96e-06 ***
## NeighborhoodBlueste -4.555e+04 2.922e+04 -1.559 0.119310
## NeighborhoodBrDale -4.976e+04 1.468e+04 -3.390 0.000718 ***
## NeighborhoodBrkSide -8.464e+04 1.170e+04 -7.233 7.70e-13 ***
## NeighborhoodClearCr -5.075e+04 1.298e+04 -3.909 9.70e-05 ***
## NeighborhoodCollgCr -2.638e+04 1.078e+04 -2.447 0.014516 *
## NeighborhoodCrawfor -4.603e+04 1.172e+04 -3.929 8.95e-05 ***
## NeighborhoodEdwards -8.317e+04 1.108e+04 -7.507 1.06e-13 ***
## NeighborhoodGilbert -4.502e+04 1.130e+04 -3.984 7.11e-05 ***
## NeighborhoodIDOTRR -9.986e+04 1.231e+04 -8.110 1.08e-15 ***
## NeighborhoodMeadowV -5.926e+04 1.368e+04 -4.330 1.59e-05 ***
## NeighborhoodMitchel -4.997e+04 1.177e+04 -4.245 2.33e-05 ***
## NeighborhoodNAmes -6.889e+04 1.080e+04 -6.381 2.38e-10 ***
## NeighborhoodNoRidge 3.041e+04 1.216e+04 2.501 0.012485 *
## NeighborhoodNPkVill -3.220e+04 1.620e+04 -1.988 0.047001 *
## NeighborhoodNridgHt 6.805e+04 1.095e+04 6.214 6.75e-10 ***
## NeighborhoodNWAmes -6.052e+04 1.138e+04 -5.317 1.22e-07 ***
## NeighborhoodOldTown -1.010e+05 1.115e+04 -9.061 < 2e-16 ***
## NeighborhoodSawyer -7.100e+04 1.146e+04 -6.197 7.53e-10 ***
## NeighborhoodSawyerW -4.202e+04 1.139e+04 -3.690 0.000233 ***
## NeighborhoodSomerst 3.370e+03 1.082e+04 0.312 0.755449
## NeighborhoodStoneBr 6.733e+04 1.240e+04 5.430 6.63e-08 ***
## NeighborhoodSWISU -1.083e+05 1.303e+04 -8.314 < 2e-16 ***
## NeighborhoodTimber -9.946e+03 1.228e+04 -0.810 0.418217
## NeighborhoodVeenker 4.321e+03 1.540e+04 0.281 0.779113
## BldgType2fmCon -2.221e+04 7.312e+03 -3.037 0.002432 **
## BldgTypeDuplex -2.964e+04 5.703e+03 -5.198 2.31e-07 ***
## BldgTypeTwnhs -5.041e+04 7.738e+03 -6.515 1.01e-10 ***
## BldgTypeTwnhsE -2.363e+04 4.897e+03 -4.825 1.55e-06 ***
## OverallCond 6.870e+03 1.014e+03 6.778 1.78e-11 ***
## GrLivArea 7.452e+01 2.372e+00 31.412 < 2e-16 ***
## YrSold -3.047e+02 7.723e+02 -0.395 0.693201
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38730 on 1427 degrees of freedom
## Multiple R-squared: 0.7675, Adjusted R-squared: 0.7623
## F-statistic: 147.2 on 32 and 1427 DF, p-value: < 2.2e-16
# Check residuals
lm_df <- augment(lm_fit)
# residual plot
lm_df %>%
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0)
(figure: residuals vs. fitted values)
# residual histogram
lm_df %>%
  ggplot(aes(.resid)) +
  geom_histogram(bins = 10)
(figure: histogram of residuals)
# residual qqplot
lm_df %>%
  ggplot(aes(sample = .resid)) +
  stat_qq() +
  stat_qq_line()
(figure: normal Q-Q plot of residuals)
# Make predictions on test data
yhats <- predict(lm_fit, newdata = test_df)
yhat_df <- data.frame(Id = test_df$Id, SalePrice = yhats %>% unname())
yhat_df %>% head() %>% kable()
|   Id | SalePrice |
|-----:|----------:|
| 1461 |  119102.6 |
| 1462 |  152796.3 |
| 1463 |  191923.8 |
| 1464 |  194855.7 |
| 1465 |  249880.3 |
| 1466 |  191798.4 |
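From here the data frame can be written out in the two-column format Kaggle expects (a minimal sketch; the file name is illustrative):
write.csv(yhat_df, "kaggle_submission.csv", row.names = FALSE)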
The predictions were entered into the competition under “mehtablocker” and received a score of 0.20281.