YouTube: https://www.youtube.com/watch?v=g6ddpZHTp24
Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean and standard deviation of \(\mu = \sigma = \frac{N+1}{2}\).
library(MASS)        # fitdistr; loaded before the tidyverse so dplyr::select() is not masked
library(tidyverse)   # dplyr, tidyr, ggplot2
library(knitr)
library(kableExtra)
set.seed(12345)
N <- 10      # any N >= 6
n <- 10000   # sample size
mu <- sigma <- (N + 1)/2
df <- data.frame(X = runif(n, min = 1, max = N), Y = rnorm(n, mean = mu, sd = sigma))
summary(df$X)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.283 5.541 5.506 7.742 10.000
summary(df$Y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -14.664 1.846 5.482 5.500 9.154 26.766
Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.
x <- quantile(df$X, 0.50)   # small "x": the median of X
x
## 50%
## 5.540952
y <- quantile(df$Y, 0.25)   # small "y": the 1st quartile of Y
y
## 25%
## 1.846076
a. P(X>x | X>y) - the probability of A occurring given that B has occurred, P(A|B), where A = {X > x} and B = {X > y}.
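Each of the estimates below is a row count divided by the relevant total, following the definition of conditional probability:

\[P(A \mid B) = \frac{P(A \cap B)}{P(B)}\]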
pba_a <- df %>% filter(X > x, X > y) %>% nrow()/n   # P(X>x and X>y)
pb_a <- df %>% filter(X > y) %>% nrow()/n           # P(X>y)
Qa <- pba_a/pb_a
Qa
## [1] 0.5512679
Given that X exceeds the 1st quartile of Y, there is about a 55% chance that X also exceeds its own median.
b. P(X>x, Y>y) - the joint probability that X exceeds its median and Y exceeds its 1st quartile:
Qb <- df %>% filter(X > x, Y > y) %>% nrow()/n
Qb
## [1] 0.3808
About 38% of the observations have X above its median and Y above its 1st quartile at the same time.
c. P(X<x | X>y) - the probability that X falls below its median, given that X exceeds y:
pba_c <- df %>% filter(X < x, X > y) %>% nrow()/n   # P(X<x and X>y)
pb_c <- df %>% filter(X > y) %>% nrow()/n           # P(X>y)
Qc <- pba_c/pb_c
Qc
## [1] 0.4487321
Given that X exceeds y, there is about a 45% chance that X falls below its median - the complement of (a), since Qa + Qc = 1.
Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.
# Probability - Joint
inv <- df %>%
  mutate(A = ifelse(X > x, " X > x", " X < x"),
         B = ifelse(Y > y, " Y > y", " Y < y")) %>%
  group_by(A, B) %>% summarise(count = n()) %>% mutate(probability = count/n)
# Probability - Marginal (row and column totals)
inv <- inv %>% ungroup() %>% group_by(A) %>%
  summarize(count = sum(count), probability = sum(probability)) %>%
  mutate(B = "Total") %>% bind_rows(inv)
inv <- inv %>% ungroup() %>% group_by(B) %>%
  summarize(count = sum(count), probability = sum(probability)) %>%
  mutate(A = "Total") %>% bind_rows(inv)
# Table
inv %>% select(-count) %>% spread(A, probability) %>% rename(` ` = B) %>%
  kable() %>% kable_styling()
|       | X < x  | X > x  | Total |
|-------|--------|--------|-------|
| Y < y | 0.1308 | 0.1192 | 0.25  |
| Y > y | 0.3692 | 0.3808 | 0.75  |
| Total | 0.5000 | 0.5000 | 1.00  |
P(X>x and Y>y) = 0.3808, while P(X>x)P(Y>y) = 0.5 × 0.75 = 0.375.
The joint probability is close to, but not equal to, the product of the marginals, so independence does not hold exactly in this sample. Whether the difference is larger than chance is tested below.
Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?
# Fisher's Exact Test
count_data <- inv %>% filter(A != "Total", B != "Total") %>%
  select(-probability) %>% spread(A, count) %>% as.data.frame()
row.names(count_data) <- count_data$B
count_data <- count_data %>% select(-B) %>% as.matrix()
fisher.test(count_data)
##
## Fisher's Exact Test for Count Data
##
## data: count_data
## p-value = 0.007904
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.032680 1.240419
## sample estimates:
## odds ratio
## 1.131777
# Chi-square Test
chisq.test(count_data)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: count_data
## X-squared = 7.0533, df = 1, p-value = 0.007912
Difference between Fisher's Exact test and the Chi-square test:
Fisher's Exact test computes an exact p-value for the association between two categorical variables and is preferred when expected cell counts are small (below about 5). The Chi-square test relies on a large-sample approximation and is appropriate when expected cell counts are large.
With 10,000 observations and all expected cell counts above 1,000, the Chi-square test is the most appropriate here. Both tests agree (p ≈ 0.008), rejecting independence at the 5% level.
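As a quick check - a sketch using the count_data matrix built above - the expected cell counts under independence can be read from the chi-square test object; all are far above 5:

chisq.test(count_data)$expected   # expected counts under the null of independence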
You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques.
# Data Load (training set; houses with GrLivArea >= 4000 sq ft are dropped as outliers)
kdata <- read.csv("https://raw.githubusercontent.com/monuchacko/cuny_msds/master/data_605/train_house.csv",
                  header = TRUE) %>%
  filter(GrLivArea < 4000)
# Helper: replace N/A's with 0 in the numeric columns used below
fill_na_zeros <- function(df) {
  df %>%
    mutate(BedroomAbvGr = replace_na(BedroomAbvGr, 0),
           BsmtFullBath = replace_na(BsmtFullBath, 0),
           BsmtHalfBath = replace_na(BsmtHalfBath, 0),
           BsmtUnfSF = replace_na(BsmtUnfSF, 0),
           EnclosedPorch = replace_na(EnclosedPorch, 0),
           Fireplaces = replace_na(Fireplaces, 0),
           GarageArea = replace_na(GarageArea, 0),
           GarageCars = replace_na(GarageCars, 0),
           HalfBath = replace_na(HalfBath, 0),
           KitchenAbvGr = replace_na(KitchenAbvGr, 0),
           LotFrontage = replace_na(LotFrontage, 0),
           OpenPorchSF = replace_na(OpenPorchSF, 0),
           PoolArea = replace_na(PoolArea, 0),
           ScreenPorch = replace_na(ScreenPorch, 0),
           TotRmsAbvGrd = replace_na(TotRmsAbvGrd, 0),
           WoodDeckSF = replace_na(WoodDeckSF, 0))
}
kdata <- fill_na_zeros(kdata)
Descriptive statistics provide information about the central location (central tendency), dispersion (variability or spread), and shape of the distribution.
summary(kdata$LotArea)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1300 7539 9468 10449 11588 215245
summary(kdata$OverallQual)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.089 7.000 10.000
summary(kdata$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129900 163000 180151 214000 625000
# Lot Area vs. Sale Price
ggplot(kdata, aes(x = LotArea, y = SalePrice)) + geom_point() + theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Overall Quality vs. Sale Price
ggplot(kdata, aes(x = OverallQual, y = SalePrice)) + geom_point(aes(color = factor(OverallQual))) +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
A correlation matrix is a table displaying the correlation coefficients between every pair of variables. It is a compact way to summarize a large dataset and to identify and visualize patterns in it.
correlationmatrix = kdata %>% select(LotArea, OverallQual, SalePrice) %>% cor() %>% as.matrix()
correlationmatrix
## LotArea OverallQual SalePrice
## LotArea 1.00000000 0.08871877 0.2698665
## OverallQual 0.08871877 1.00000000 0.8008584
## SalePrice 0.26986648 0.80085836 1.0000000
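The matrix could also be visualized; a minimal sketch, assuming the corrplot package (not loaded above):

library(corrplot)
corrplot(correlationmatrix, method = "circle")   # circle size and shade encode the strength of each correlation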
A correlation test is used to evaluate the association between two or more variables.
Pearson correlation (r) measures the linear dependence between two variables x and y. It is a parametric test because it depends on the distribution of the data: strictly, it applies when x and y come from normal distributions, and the fitted line of y on x is the linear regression line. Its assumptions are interval-level measurement, related pairs, absence of outliers, normality of the variables, linearity, and homoscedasticity.
Kendall's tau and Spearman's rho are rank-based correlation coefficients (non-parametric) and do not require normality.
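The tests below use Pearson's method with an 80% confidence level. For comparison, a rank-based alternative would be a sketch like:

cor.test(kdata$LotArea, kdata$SalePrice, method = "spearman")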
cor.test(kdata$LotArea, kdata$SalePrice, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: kdata$LotArea and kdata$SalePrice
## t = 10.687, df = 1454, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2384211 0.3007466
## sample estimates:
## cor
## 0.2698665
cor.test(kdata$OverallQual, kdata$SalePrice, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: kdata$OverallQual and kdata$SalePrice
## t = 50.994, df = 1454, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.7884724 0.8125951
## sample estimates:
## cor
## 0.8008584
The important numbers here are the degrees of freedom (df), the p-value, and the Pearson correlation estimate. Both p-values are far below 0.05 (< 2.2e-16), so both correlations are statistically significant: SalePrice is weakly correlated with LotArea (r ≈ 0.27) and strongly correlated with OverallQual (r ≈ 0.80).
The precision matrix of a random vector is the inverse of its covariance (here, correlation) matrix. Multiplying the correlation matrix by its precision matrix in either order should recover the identity matrix, and an LU decomposition factors the correlation matrix into lower and upper triangular parts whose product reproduces it.
precisionmatrix <- solve(correlationmatrix)   # invert the correlation matrix
precisionmatrix
## LotArea OverallQual SalePrice
## LotArea 1.1339031 0.4028324 -0.6286141
## OverallQual 0.4028324 2.9315320 -2.4564529
## SalePrice -0.6286141 -2.4564529 3.1369127
round(correlationmatrix %*% precisionmatrix)   # correlation x precision = identity
## LotArea OverallQual SalePrice
## LotArea 1 0 0
## OverallQual 0 1 0
## SalePrice 0 0 1
round(precisionmatrix %*% correlationmatrix)   # precision x correlation = identity
## LotArea OverallQual SalePrice
## LotArea 1 0 0
## OverallQual 0 1 0
## SalePrice 0 0 1
# LU decomposition (the $L/$U output suggests matrixcalc::lu.decomposition)
library(matrixcalc)
lu.decomposition(correlationmatrix)
## $L
## [,1] [,2] [,3]
## [1,] 1.00000000 0.0000000 0
## [2,] 0.08871877 1.0000000 0
## [3,] 0.26986648 0.7830798 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1 0.08871877 0.2698665
## [2,] 0 0.99212898 0.7769161
## [3,] 0 0.00000000 0.3187848
# For comparison, L %*% U reproduces the original correlation matrix:
correlationmatrix
## LotArea OverallQual SalePrice
## LotArea 1.00000000 0.08871877 0.2698665
## OverallQual 0.08871877 1.00000000 0.8008584
## SalePrice 0.26986648 0.80085836 1.0000000
Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
Maximum-likelihood fitting of univariate distributions: the goal is to find the parameter values under which a chosen closed-form distribution (e.g., normal, exponential, gamma) best fits the data. GrLivArea (above-grade living area) is right-skewed and strictly positive, so no shift is needed before fitting an exponential.
# Fit an exponential distribution to GrLivArea by maximum likelihood
expfit <- fitdistr(kdata$GrLivArea, "exponential")
expfit
## rate
## 6.637893e-04
## (1.739601e-05)
lambda <- expfit$estimate   # optimal rate parameter (1/mean)
lambda
## rate
## 0.0006637893
opval <- rexp(1000, lambda)   # 1,000 samples from the fitted exponential
par(mfrow = c(1, 2))
hist(opval, breaks = 50, xlim = c(0, 6000), main = "Exponential - GrLivArea", col = "#F0FFFF")
hist(kdata$GrLivArea, breaks = 50, main = "Original - GrLivArea", col = "#FFE4E1")
A confidence interval (CI) is a range of values that is likely to contain a population parameter with a stated degree of confidence. It is usually expressed as a percentage: in that proportion of repeated samples, the population mean would lie between the lower and upper bounds.
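Assuming normality, the 95% interval for the mean computed below is the standard

\[\bar{x} \pm 1.96 \cdot \frac{s}{\sqrt{n}}\]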
# 95% CI for the mean of GrLivArea, assuming normality
m <- mean(kdata$GrLivArea)
se <- sd(kdata$GrLivArea)/sqrt(nrow(kdata))
c(upper = m + 1.96 * se, mean = m, lower = m - 1.96 * se)
## upper mean lower
## 1532.042 1506.502 1480.962
# Empirical 5th and 95th percentiles
quantile(kdata$GrLivArea, 0.05)
## 5%
## 848
quantile(kdata$GrLivArea, 0.95)
## 95%
## 2450.5
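The 5th and 95th percentiles of the fitted exponential itself come from its CDF; a sketch using the lambda fitted above:

qexp(c(0.05, 0.95), rate = lambda)   # roughly 77 and 4513

These differ sharply from the empirical percentiles (848 and 2450.5): the exponential puts far more mass near zero and in the long right tail than GrLivArea actually has, so although both are right-skewed, the exponential is a poor fit for this variable.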
Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
# Columns with no missing values (candidate predictors; not used further below)
model <- kdata[, which(sapply(kdata, function(x) sum(is.na(x))) == 0)]
fit <- lm(SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea, data = kdata)
summary(fit)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars +
##     GarageArea, data = kdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -132308 -21454 -1728 18215 280617
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.000e+05 4.461e+03 -22.419 < 2e-16 ***
## OverallQual 2.670e+04 9.742e+02 27.410 < 2e-16 ***
## GrLivArea 5.252e+01 2.447e+00 21.466 < 2e-16 ***
## GarageCars 4.513e+03 2.924e+03 1.543 0.123
## GarageArea 6.469e+01 9.913e+00 6.526 9.31e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36880 on 1451 degrees of freedom
## Multiple R-squared: 0.7694, Adjusted R-squared: 0.7688
## F-statistic: 1210 on 4 and 1451 DF, p-value: < 2.2e-16
All predictors except GarageCars are highly significant, and the model explains about 77% of the variance in SalePrice (adjusted R-squared = 0.7688).
par(mfrow = c(2, 2))
X1 = kdata$OverallQual
X2 = kdata$GrLivArea
X3 = kdata$GarageCars
X4 = kdata$GarageArea
Y = kdata$SalePrice
plot(X1, Y, col = "#c2bfbc", main = "OverallQual", ylab = "Sale Price")
abline(lm(Y ~ X1), col = "#f25d9c", lwd = 3)
plot(X2, Y, col = "#c6e2ff", main = "GrLivArea", ylab = "Sale Price")
abline(lm(Y ~ X2), col = "#c2bfbc", lwd = 3)
plot(X3, Y, col = "#aa4371", main = "GarageCars", ylab = "Sale Price")
abline(lm(Y ~ X3), col = "#c2bfbc", lwd = 3)
plot(X4, Y, col = "#37004d", main = "GarageArea", ylab = "Sale Price")
abline(lm(Y ~ X4), col = "#c6e2ff", lwd = 3)
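Beyond these bivariate views, the usual residual diagnostics for the fitted model could be checked; a minimal sketch:

par(mfrow = c(2, 2))
plot(fit)   # residuals vs fitted, normal Q-Q, scale-location, residuals vs leverage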
kdatatest <- read.csv("https://raw.githubusercontent.com/monuchacko/cuny_msds/master/data_605/test_house.csv",
                      header = TRUE) %>%
  filter(GrLivArea < 4000)
kdatatest <- fill_na_zeros(kdatatest)   # apply the same N/A cleaning to the test set
# Predicted sale prices from the fitted coefficients, applied to the test data
SalePrice <- (26988.854 * kdatatest$OverallQual) + (49.573 * kdatatest$GrLivArea) +
  (11317.522 * kdatatest$GarageCars) + (41.478 * kdatatest$GarageArea) - 98436.05
kdatatest <- kdatatest[, c("Id", "OverallQual", "GrLivArea", "GarageCars", "GarageArea")]
head(kdatatest)
## Id OverallQual GrLivArea GarageCars GarageArea
## 1 1461 5 896 1 730
## 2 1462 6 1329 1 312
## 3 1463 5 1629 2 482
## 4 1464 6 1604 2 470
## 5 1465 8 1280 2 506
## 6 1466 6 1655 2 440
ksubmission <- cbind(kdatatest$Id, kdata$SalePrice)   # note: recycles the training SalePrice vector, hence the warning
## Warning in cbind(kdatatest$Id, kdata$SalePrice): number of rows of result is not
## a multiple of vector length (arg 2)
colnames(ksubmission) <- c("Id", "SalePrice")
ksubmission[ksubmission < 0] <- median(SalePrice)   # guard against negative values, using the median predicted price
ksubmission <- as.data.frame(ksubmission[1:1458, ])
head(ksubmission)
## Id SalePrice
## 1 1461 208500
## 2 1462 181500
## 3 1463 223500
## 4 1464 140000
## 5 1465 250000
## 6 1466 143000
Kaggle score: 0.56288
Kaggle user: monuchacko