Load libraries

library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(broom)
library(knitr)

Problem 1

N <- 6
X <- runif(10000, 1, N)               # X ~ Uniform(1, N)
Y <- rnorm(10000, (N+1)/2, (N+1)/2)   # Y ~ Normal with mean and sd both (N+1)/2

x <- median(X)                        # x = median of X
y <- unname(quantile(Y)[2])           # y = first quartile of Y

### a. The probability that X is greater than the median of X, given that X is greater than the first quartile of Y
# P(X>x | X>y) = P(X>x and X>y) / P(X>y); both events involve the same variable X, so the joint must be counted directly
round(mean(X>x & X>y)/mean(X>y), 3)
## [1] 0.514
### b. The probability that X is greater than the median of X and Y is greater than the first quartile of Y
mean(X>x)*mean(Y>y)  # X and Y are drawn independently, so the joint probability factors into the product of the marginals
## [1] 0.375
### c. The probability that X is less than the median of X, given that X is greater than the first quartile of Y
round(mean(X<x & X>y)/mean(X>y), 3)
## [1] 0.486
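# Cross-check against the closed-form values (a sketch; x and y are the sample quantiles
# from above, with x ~ 3.5 and y ~ 1.14, so x > y; the unseeded simulation should land close):
p_a <- punif(x, 1, N, lower.tail = FALSE) / punif(y, 1, N, lower.tail = FALSE)  # P(X>x)/P(X>y), since {X>x} lies inside {X>y}
p_b <- punif(x, 1, N, lower.tail = FALSE) * pnorm(y, (N+1)/2, (N+1)/2, lower.tail = FALSE)
p_c <- (punif(x, 1, N) - punif(y, 1, N)) / punif(y, 1, N, lower.tail = FALSE)
round(c(a = p_a, b = p_b, c = p_c), 3)  # roughly 0.514, 0.375, and 0.486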
### Two-Way Table:
df <- tibble(rowname = c("X>x", "X<=x"), 
             `Y>y`=c(mean(Y>y)*mean(X>x), mean(Y>y)*mean(X<=x)), 
             `Y<=y`=c(mean(Y<=y)*mean(X>x), mean(Y<=y)*mean(X<=x))) %>% 
  column_to_rownames()
df %>% kable()
        Y>y   Y<=y
X>x   0.375  0.125
X<=x  0.375  0.125
# The upper-left cell is 0.375, the joint probability from part (b).
# The marginal probability for the first row is 0.375 + 0.125 = 0.5 and for the first column is 0.375 + 0.375 = 0.75,
# and 0.5 * 0.75 = 0.375, consistent with P(A and B) = P(A) * P(B) for independent events.
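# As a sanity check, the same table can be tabulated directly from joint counts rather
# than from products of marginals (a sketch; since X and Y are independent, the two
# versions agree up to sampling error):
joint_counts <- table(X > x, Y > y, dnn = c("X>x", "Y>y"))
prop.table(joint_counts)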

### Independence tests
fisher.test(x = df %>% as.matrix()*10000)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  df %>% as.matrix() * 10000
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.9124854 1.0959080
## sample estimates:
## odds ratio 
##          1
chisq.test(x = df %>% as.matrix()*10000)  # with matrix input, chisq.test tests independence of rows and columns
## 
##  Pearson's Chi-squared test
## 
## data:  df %>% as.matrix() * 10000
## X-squared = 0, df = 1, p-value = 1
# Whichever test one prefers, the result is the same: both give a p-value of 1, indicating no evidence of an association. (In other words, there is no reason to reject the hypothesis that the events are independent.)
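# The tests can also be run on the directly tabulated counts from above, which avoids
# building the table under the very independence assumption being tested (a sketch;
# the p-values should again be large):
fisher.test(joint_counts)
chisq.test(joint_counts)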


Problem 2

Kaggle house prices competition

### Load data from Github
train_df <- read.csv('https://raw.githubusercontent.com/mehtablocker/cuny_605/master/data_files/train.csv')
test_df <- read.csv('https://raw.githubusercontent.com/mehtablocker/cuny_605/master/data_files/test.csv')
train_df %>% head() %>% kable()
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
1 60 RL 65 8450 Pave NA Reg Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196 Gd TA PConc Gd TA No GLQ 706 Unf 0 150 856 GasA Ex Y SBrkr 856 854 0 1710 1 0 2 1 3 1 Gd 8 Typ 0 NA Attchd 2003 RFn 2 548 TA TA Y 0 61 0 0 0 0 NA NA NA 0 2 2008 WD Normal 208500
2 20 RL 80 9600 Pave NA Reg Lvl AllPub FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0 TA TA CBlock Gd TA Gd ALQ 978 Unf 0 284 1262 GasA Ex Y SBrkr 1262 0 0 1262 0 1 2 0 3 1 TA 6 Typ 1 TA Attchd 1976 RFn 2 460 TA TA Y 298 0 0 0 0 0 NA NA NA 0 5 2007 WD Normal 181500
3 60 RL 68 11250 Pave NA IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162 Gd TA PConc Gd TA Mn GLQ 486 Unf 0 434 920 GasA Ex Y SBrkr 920 866 0 1786 1 0 2 1 3 1 Gd 6 Typ 1 TA Attchd 2001 RFn 2 608 TA TA Y 0 42 0 0 0 0 NA NA NA 0 9 2008 WD Normal 223500
4 70 RL 60 9550 Pave NA IR1 Lvl AllPub Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0 TA TA BrkTil TA Gd No ALQ 216 Unf 0 540 756 GasA Gd Y SBrkr 961 756 0 1717 1 0 1 0 3 1 Gd 7 Typ 1 Gd Detchd 1998 Unf 3 642 TA TA Y 0 35 272 0 0 0 NA NA NA 0 2 2006 WD Abnorml 140000
5 60 RL 84 14260 Pave NA IR1 Lvl AllPub FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350 Gd TA PConc Gd TA Av GLQ 655 Unf 0 490 1145 GasA Ex Y SBrkr 1145 1053 0 2198 1 0 2 1 4 1 Gd 9 Typ 1 TA Attchd 2000 RFn 3 836 TA TA Y 192 84 0 0 0 0 NA NA NA 0 12 2008 WD Normal 250000
6 50 RL 85 14115 Pave NA IR1 Lvl AllPub Inside Gtl Mitchel Norm Norm 1Fam 1.5Fin 5 5 1993 1995 Gable CompShg VinylSd VinylSd None 0 TA TA Wood Gd TA No GLQ 732 Unf 0 64 796 GasA Ex Y SBrkr 796 566 0 1362 1 0 1 1 1 1 TA 5 Typ 0 NA Attchd 1993 Unf 2 480 TA TA Y 40 30 0 320 0 0 NA MnPrv Shed 700 10 2009 WD Normal 143000
### Summary and histogram of target variable
train_df %>% select(SalePrice) %>% summary()
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000
train_df %>% ggplot(aes(SalePrice)) + geom_histogram(bins=20)
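# The mean (180921) exceeds the median (163000), suggesting right skew; a log-scale
# view of the same histogram makes the shape easier to judge (a quick sketch):
train_df %>% ggplot(aes(SalePrice)) + geom_histogram(bins=20) + scale_x_log10()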

### Scatterplot matrix of select variables
train_df %>% select(YrSold, GrLivArea, SalePrice) %>% plot()

### Correlation matrix of select variables
cor_mat <- train_df %>% select(YrSold, GrLivArea, SalePrice) %>% cor()
round(cor_mat, 2)
##           YrSold GrLivArea SalePrice
## YrSold      1.00     -0.04     -0.03
## GrLivArea  -0.04      1.00      0.71
## SalePrice  -0.03      0.71      1.00
### Confidence intervals for correlations
round(cor.test(train_df$GrLivArea, train_df$SalePrice, conf.level = 0.8)$conf.int[1:2], 2)
## [1] 0.69 0.72
round(cor.test(train_df$YrSold, train_df$SalePrice, conf.level = 0.8)$conf.int[1:2], 2)
## [1] -0.06  0.00
round(cor.test(train_df$YrSold, train_df$GrLivArea, conf.level = 0.8)$conf.int[1:2], 2)
## [1] -0.07  0.00
# Zero lies outside the interval for the GrLivArea/SalePrice pair, so that correlation is unlikely to be due to chance alone; for the other two pairs the intervals reach zero, so we cannot rule out that those correlations arose by chance.

# In general it is very important to be cautious of familywise error, i.e., finding spurious correlations, particularly when dealing with over 75 variables. The more variables there are, the higher the probability of finding correlations by chance. We can mitigate this risk by using domain knowledge for variable selection and/or applying sensible Bayesian priors.
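# One mechanical guard (not used above) is to adjust the p-values for the number of
# tests performed, e.g. with Holm's method (a sketch using the three pairs from above):
p_vals <- c(GrLivArea_SalePrice = cor.test(train_df$GrLivArea, train_df$SalePrice)$p.value,
            YrSold_SalePrice = cor.test(train_df$YrSold, train_df$SalePrice)$p.value,
            YrSold_GrLivArea = cor.test(train_df$YrSold, train_df$GrLivArea)$p.value)
p.adjust(p_vals, method = "holm")  # familywise-error-adjusted p-values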

### Linear Algebra
prec_mat <- solve(cor_mat)
prec_mat
##                YrSold   GrLivArea    SalePrice
## YrSold    1.001354525  0.03224312  0.006113496
## GrLivArea 0.032243122  2.00966991 -1.423168733
## SalePrice 0.006113496 -1.42316873  2.008669018
round(cor_mat %*% prec_mat, 2)
##           YrSold GrLivArea SalePrice
## YrSold         1         0         0
## GrLivArea      0         1         0
## SalePrice      0         0         1
round(prec_mat %*% cor_mat, 2)
##           YrSold GrLivArea SalePrice
## YrSold         1         0         0
## GrLivArea      0         1         0
## SalePrice      0         0         1
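# As an aside, the off-diagonal entries of the precision matrix encode partial
# correlations via the standard identity -P[i,j]/sqrt(P[i,i]*P[j,j]) (a sketch):
-prec_mat["GrLivArea", "SalePrice"] /
  sqrt(prec_mat["GrLivArea", "GrLivArea"] * prec_mat["SalePrice", "SalePrice"])
# i.e., GrLivArea and SalePrice remain strongly correlated even after controlling for YrSold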
LU_decomp <- function(A){
  
  ### Check that the matrix is square and full rank, with no zeros on the initial diagonal (pivots)
  if(any(diag(A)==0) | qr(A)$rank<max(nrow(A), ncol(A))) {
    message("Error: This matrix cannot be decomposed.")
  } else {
    U <- A
    # Get U by going column by column and algebraically putting zeros in all positions below the diagonal
    for (j in 1:(ncol(U)-1)){
      for (i in (j+1):nrow(U)){
        if (U[i,j]!=0){U[i,] <- U[i, ] - U[j, ]*(U[i,j]/U[j,j])}
      }
    }
    
    # Get L by restating L U = A  as  U_inv L_inv = A_inv, solving for L_inv, and then taking the inverse to get L
    U_inv <- solve(U)
    A_inv <- solve(A)
    L_inv <- solve(U_inv, A_inv)
    L <- solve(L_inv)
    list(L=round(L,2) , U=round(U,2))
  }
}

LU_decomp(cor_mat)
## $L
##           YrSold GrLivArea SalePrice
## YrSold      1.00      0.00         0
## GrLivArea  -0.04      1.00         0
## SalePrice  -0.03      0.71         1
## 
## $U
##           YrSold GrLivArea SalePrice
## YrSold         1     -0.04     -0.03
## GrLivArea      0      1.00      0.71
## SalePrice      0      0.00      0.50
LU_decomp(prec_mat)
## $L
##           YrSold GrLivArea SalePrice
## YrSold      1.00      0.00         0
## GrLivArea   0.03      1.00         0
## SalePrice   0.01     -0.71         1
## 
## $U
##           YrSold GrLivArea SalePrice
## YrSold         1      0.03      0.01
## GrLivArea      0      2.01     -1.42
## SalePrice      0      0.00      1.00
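# Quick verification (a sketch): multiplying the factors back together should recover
# the original matrix, up to the rounding applied inside the function
lu_cor <- LU_decomp(cor_mat)
round(lu_cor$L %*% lu_cor$U - cor_mat, 1)  # approximately the zero matrix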
### Check to see which variables are right skewed
num_inds <- which(sapply(train_df, class)!="factor") #indices of the numerical variables
train_df %>% 
  select(num_inds) %>% 
  gather(variable, value, -Id) %>%  #keep Id out of the gathered columns so it does not get its own facet
  filter(!is.na(value)) %>% 
  ggplot(aes(x=value)) + geom_histogram(bins=20) + facet_wrap(~variable, scales = 'free_x') #histograms

### Fit exponential distribution to OpenPorchSF
lam <- MASS::fitdistr(train_df$OpenPorchSF, "exponential")$estimate %>% unname()
sim_vals <- rexp(nrow(train_df), lam)
train_df %>% 
  mutate(sim_OPSF = sim_vals) %>% 
  select(Id, sim_OPSF, OpenPorchSF) %>% 
  gather(variable, value, -Id) %>% 
  filter(!is.na(value)) %>% 
  ggplot(aes(x=value)) + geom_histogram(bins=20) + facet_wrap(~variable, scales = 'free_x') #compare histograms

### Theoretical 5th and 95th percentiles
qexp(p = c(0.05, 0.95), rate = lam)
## [1]   2.393359 139.781689
### Empirical percentiles
quantile(train_df$OpenPorchSF, c(0.05, 0.95))
##     5%    95% 
##   0.00 175.05
# The fit is clearly not perfect: the data have a point mass at zero (the empirical 5th percentile is 0, since many houses have no open porch) and a heavier right tail than the exponential implies.
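# Comparing a few more quantiles makes the mismatch concrete (a sketch):
probs <- c(0.05, 0.25, 0.5, 0.75, 0.95)
rbind(theoretical = qexp(probs, rate = lam),
      empirical = quantile(train_df$OpenPorchSF, probs))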

### 95% confidence interval for the mean, using the asymptotic CI based on Fisher information
# For an exponential with mean mu, the Fisher information gives Var(mu_hat) = mu^2/n, so se = xbar/sqrt(n)
xbar <- mean(train_df$OpenPorchSF)
z <- 1.96
n <- length(train_df$OpenPorchSF)
se <- xbar/sqrt(n)
c(xbar-z*se, xbar+z*se)
## [1] 44.26681 49.05374
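# An exact interval is also available here: for i.i.d. exponential data, 2*sum(x)/mu
# follows a chi-square distribution with 2n degrees of freedom (a sketch, assuming
# the exponential model actually holds):
total <- sum(train_df$OpenPorchSF)
c(2*total/qchisq(0.975, df = 2*n), 2*total/qchisq(0.025, df = 2*n))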
### Multiple regression model
lm_fit <- lm(SalePrice ~ LotArea + Neighborhood + BldgType + OverallCond + GrLivArea + YrSold, data=train_df)
summary(lm_fit)
## 
## Call:
## lm(formula = SalePrice ~ LotArea + Neighborhood + BldgType + 
##     OverallCond + GrLivArea + YrSold, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -320405  -19112   -1042   15121  275473 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          6.863e+05  1.550e+06   0.443 0.658066    
## LotArea              5.387e-01  1.163e-01   4.631 3.96e-06 ***
## NeighborhoodBlueste -4.555e+04  2.922e+04  -1.559 0.119310    
## NeighborhoodBrDale  -4.976e+04  1.468e+04  -3.390 0.000718 ***
## NeighborhoodBrkSide -8.464e+04  1.170e+04  -7.233 7.70e-13 ***
## NeighborhoodClearCr -5.075e+04  1.298e+04  -3.909 9.70e-05 ***
## NeighborhoodCollgCr -2.638e+04  1.078e+04  -2.447 0.014516 *  
## NeighborhoodCrawfor -4.603e+04  1.172e+04  -3.929 8.95e-05 ***
## NeighborhoodEdwards -8.317e+04  1.108e+04  -7.507 1.06e-13 ***
## NeighborhoodGilbert -4.502e+04  1.130e+04  -3.984 7.11e-05 ***
## NeighborhoodIDOTRR  -9.986e+04  1.231e+04  -8.110 1.08e-15 ***
## NeighborhoodMeadowV -5.926e+04  1.368e+04  -4.330 1.59e-05 ***
## NeighborhoodMitchel -4.997e+04  1.177e+04  -4.245 2.33e-05 ***
## NeighborhoodNAmes   -6.889e+04  1.080e+04  -6.381 2.38e-10 ***
## NeighborhoodNoRidge  3.041e+04  1.216e+04   2.501 0.012485 *  
## NeighborhoodNPkVill -3.220e+04  1.620e+04  -1.988 0.047001 *  
## NeighborhoodNridgHt  6.805e+04  1.095e+04   6.214 6.75e-10 ***
## NeighborhoodNWAmes  -6.052e+04  1.138e+04  -5.317 1.22e-07 ***
## NeighborhoodOldTown -1.010e+05  1.115e+04  -9.061  < 2e-16 ***
## NeighborhoodSawyer  -7.100e+04  1.146e+04  -6.197 7.53e-10 ***
## NeighborhoodSawyerW -4.202e+04  1.139e+04  -3.690 0.000233 ***
## NeighborhoodSomerst  3.370e+03  1.082e+04   0.312 0.755449    
## NeighborhoodStoneBr  6.733e+04  1.240e+04   5.430 6.63e-08 ***
## NeighborhoodSWISU   -1.083e+05  1.303e+04  -8.314  < 2e-16 ***
## NeighborhoodTimber  -9.946e+03  1.228e+04  -0.810 0.418217    
## NeighborhoodVeenker  4.321e+03  1.540e+04   0.281 0.779113    
## BldgType2fmCon      -2.221e+04  7.312e+03  -3.037 0.002432 ** 
## BldgTypeDuplex      -2.964e+04  5.703e+03  -5.198 2.31e-07 ***
## BldgTypeTwnhs       -5.041e+04  7.738e+03  -6.515 1.01e-10 ***
## BldgTypeTwnhsE      -2.363e+04  4.897e+03  -4.825 1.55e-06 ***
## OverallCond          6.870e+03  1.014e+03   6.778 1.78e-11 ***
## GrLivArea            7.452e+01  2.372e+00  31.412  < 2e-16 ***
## YrSold              -3.047e+02  7.723e+02  -0.395 0.693201    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38730 on 1427 degrees of freedom
## Multiple R-squared:  0.7675, Adjusted R-squared:  0.7623 
## F-statistic: 147.2 on 32 and 1427 DF,  p-value: < 2.2e-16
# Check residuals
lm_df <- augment(lm_fit)
    
# residual plot
lm_df %>% 
  ggplot(aes(x = .fitted, y = .resid)) + 
  geom_point() + 
  geom_hline(yintercept = 0)

# residual histogram
lm_df %>% 
  ggplot(aes(.resid)) + 
  geom_histogram(bins = 10)

# residual qqplot
lm_df %>% 
  ggplot(aes(sample = .resid)) + 
  stat_qq() + 
  stat_qq_line()
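
# A formal complement to the plots (a sketch): Shapiro-Wilk test of normality on the residuals
shapiro.test(lm_df$.resid)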

# Make predictions on test data
yhats <- predict(lm_fit, newdata = test_df)
yhat_df <- data.frame(Id = test_df$Id, SalePrice = yhats %>% unname())
yhat_df %>% head() %>% kable()
Id SalePrice
1461 119102.6
1462 152796.3
1463 191923.8
1464 194855.7
1465 249880.3
1466 191798.4
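
For reference, a Kaggle submission file can be written out with something like the following (a sketch; the file name is arbitrary):

write.csv(yhat_df, "submission.csv", row.names = FALSE)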


The predictions were entered into the competition under “mehtablocker” and received a score of 0.20281.