#Libraries required
library(dplyr)
library(Matrix)
library(ggplot2)
library(MASS)

Problem 1:

Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean and standard deviation of \(\mu = \sigma = (N+1)/2\).

set.seed(800)
#Choose N value:
N <- 8

#Set count number:
count <- 10000
mu <- (N+1)/2

#Generate X variable of 10000 random uniform numbers between 1 and N:

X <- runif(count, min=1, max=N)
summary(X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.001   2.722   4.471   4.491   6.242   7.999
#Generate Y variable of 10000 random normal numbers with specified mu=sigma=(N+1)/2

Y <- rnorm(count, mean=mu, sd = mu)
summary(Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -12.090   1.553   4.513   4.553   7.628  20.750
#Create a dataframe:
XY_data <- data.frame(x = X, y = Y)

Probability.

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

  a. P(X>x | X>y)

  b. P(X>x, Y>y)

  c. P(X<x | X>y)

Answer

Define x and y variables:

x is estimated as the median of the X variable = 4.471

y is estimated as the 1st quartile of the Y variable = 1.553

a. P(X>x | X>y)

x <- 4.522
y <- 1.524

#Calculate the probability P(X>x):
P1 <- sum(X > x) / 10000

#Calculate the probability P(X>y):
P2 <- sum(X > y) / 10000

#Conditional probability = P(A and B) / P(B)
#Since x > y, the event {X > x} is contained in {X > y}, so P(X>x and X>y) = P(X>x)
cond_prob <- P1 / P2
round(cond_prob,3)
## [1] 0.537

This is the probability that X exceeds x (the median of X), given that X is already greater than y (the first quartile of Y).

P(X>x | X>y) = 0.537.

b. P(X>x, Y>y)

#P(X>x)
P3 <- sum(X > x) / 10000

#P(Y>y)
P4 <- sum(Y > y) / 10000

#Since X and Y were generated independently, the joint probability is estimated as the product of the marginals: P(X>x, Y>y) = P(X>x)*P(Y>y)
Joint <- P3 * P4
round(Joint, 3)
## [1] 0.371

P(X>x, Y>y) = 0.371.
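As a cross-check, the joint probability can also be counted directly from the simulated pairs rather than assuming independence; a quick sketch using the X, Y, x, and y objects defined above:

#Empirical joint probability, counted directly from the simulated data
joint_emp <- mean(X > x & Y > y)

#Product of the marginal probabilities
joint_prod <- mean(X > x) * mean(Y > y)

round(c(empirical = joint_emp, product = joint_prod), 3)

Because X and Y were generated independently, the counted value and the product should agree to within sampling error.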

c. P(X<x | X>y)

#Calculate the probability P(X<x):
P5 <- sum(X < x) / 10000

#Calculate the probability P(X>y):
P6 <- sum(X > y) / 10000

#Conditional probability = P(A and B) / P(B)
#Here P(X<x and X>y) = P(y < X < x) = P(X<x) - P(X<y) = P5 - (1 - P6)
cond_prob2 <- (P5 - (1 - P6)) / P6

P5
## [1] 0.5062
P6
## [1] 0.9204
round(cond_prob2,3)
## [1] 0.463

P(X<x | X>y) = 0.463. Given that X is greater than y, X falls below its median x about 46% of the time; this is the complement of part (a), since 1 - 0.537 = 0.463.
  1. Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.
#Initialize probabilities of interest
#P(X>x)
X_gt <- round(sum(X > x) / 10000,3)

#P(X<x)
X_lt <- round(sum(X < x) / 10000,3) 

#P(Y>y)
Y_gt <- round(sum(Y > y) / 10000,3) 

#P(Y<y)
Y_lt <- round(sum(Y < y) / 10000,3) 


total <- X_gt + X_lt #or Y_gt + Y_lt ... either way it's 1.0

#Calculate joint probabilities as products of the marginal probabilities

p22 <- X_lt * Y_lt #P(X<x,Y<y)

p11 <- X_gt * Y_gt #P(X>x,Y>y)

p12 <- X_lt * Y_gt #P(X<x,Y>y)

p21 <- X_gt * Y_lt #P(X>x,Y<y)

#Populate table with marginal and joint probabilities
##The table is populated column by column, top to bottom
p <- matrix(c("", "", "Y", "", "", "", "", ">y", "<y", "marginal (X)", "", ">x",p11, p21, X_gt, "X", "<x", p12, p22, X_lt, "", "marginal (Y)",Y_gt, Y_lt, total), ncol=5)


mat <- data.frame(p)
mat
##   X1           X2       X3       X4           X5
## 1                                 X             
## 2                       >x       <x marginal (Y)
## 3  Y           >y 0.371488 0.380512        0.752
## 4              <y 0.122512 0.125488        0.248
## 5    marginal (X)    0.494    0.506            1

In the table above, the joint cells were filled in as products of the corresponding marginal probabilities. Because X and Y were generated independently, counting the joint events directly from the data (see the sketch below) gives essentially the same values, so P(X>x and Y>y) = P(X>x)P(Y>y) holds.
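The same table can also be built from raw counts rather than products; a short sketch using the objects defined above, where addmargins() appends the marginal totals:

#2x2 table of joint probabilities counted directly from the simulated data,
#with marginal probabilities added on the edges
joint_counts <- table(X = X > x, Y = Y > y)
addmargins(prop.table(joint_counts))

The resulting cell proportions should line up closely with the product-based table above.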

  1. Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?
#Build a 2x2 contingency table of counts (the joint probabilities above scaled to approximate counts out of 10,000)
table <- matrix(c(3714, 1225, 3805, 1254), ncol=2)
table
##      [,1] [,2]
## [1,] 3714 3805
## [2,] 1225 1254
# Fisher's Exact Test:
fisher.test(table)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.9115072 1.0953263
## sample estimates:
## odds ratio 
##  0.9991915
# Chi Square Test:
chisq.test(table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table
## X-squared = 2.7507e-29, df = 1, p-value = 1

Both tests produce a p-value of 1, so we fail to reject the null hypothesis that the two events are independent.

The main difference between Fisher's Exact Test and the Chi-Square Test is that Fisher's computes an exact p-value and is typically recommended for small samples (for example, when expected cell counts are low), whereas the Chi-Square Test relies on a large-sample approximation.

Since we have a large sample of 10,000 observations, the Chi-Square Test is the more appropriate choice here.

Problem 2:

Register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques.

Descriptive and Inferential Statistics.

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

We start by downloading the data from the Kaggle website via the link above, uploading it to GitHub, and then reading it into R.

#Load the housing data 

Train_Data <- read.csv("https://github.com/GehadGad/housing-train-data-/raw/main/train.csv")

Test_Data <- read.csv("https://github.com/GehadGad/Housing-test-data/raw/main/test.csv")
#Check the dimension of the dataset

dim(Train_Data)
## [1] 1460   81

The training data contain 1,460 observations (rows) and 81 features (columns).

Select three independent variables and a dependent variable to perform statistical operations.

Dependent variable: SalePrice

Independent variables: GarageArea, GrLivArea, LotArea

Get the summary for our selected variables:

The summary statistics provide the minimum, maximum, mean, median, and 1st and 3rd quartiles for each variable.

summary(Train_Data$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
summary(Train_Data$GarageArea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   334.5   480.0   473.0   576.0  1418.0
summary(Train_Data$GrLivArea) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1130    1464    1515    1777    5642
summary(Train_Data$LotArea) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1300    7554    9478   10517   11602  215245

Graphs

hist(Train_Data$SalePrice, xlab = "SalePrice", main = "Sale Price Histogram", col = "blue")

hist(Train_Data$GarageArea, xlab = "GarageArea", main = "Garage Area Histogram", col = "blue")

# Scatterplot of GrLivArea and GarageArea vs SalePrice
par(mfrow=c(1,2))
plot(Train_Data$GrLivArea, Train_Data$SalePrice, xlab="GrLivArea", ylab="SalePrice", main="GrLivArea vs SalePrice")
plot(Train_Data$GarageArea, Train_Data$SalePrice, xlab="GarageArea", ylab="SalePrice", main="GarageArea vs SalePrice")

Both scatterplots show a positive relationship with SalePrice: sale price tends to increase with GrLivArea (above-grade living area in square feet) and with GarageArea. This is expected, since a larger living area and a larger garage are both associated with a larger, more expensive house.
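The assignment also asks for a scatterplot matrix; a minimal version using base R's pairs() on the selected variables:

#Scatterplot matrix of the dependent variable and the selected predictors
pairs(Train_Data[c("SalePrice", "GrLivArea", "GarageArea", "LotArea")],
      main = "Scatterplot Matrix")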

#BoxPlot and Histograms for the LotArea
par(mfrow=c(1,2))
boxplot(Train_Data$LotArea, main="LotArea BoxPlot")
hist(Train_Data$LotArea, breaks = 20, main = "LotArea Histogram")

The LotArea distribution is strongly right-skewed, and the boxplot shows many large outliers in the upper tail.

library(corrplot)
## corrplot 0.84 loaded
corr.df = Train_Data[c("GrLivArea", "LotArea", "GarageArea", "SalePrice")]
corr.matrix = cor(corr.df, use = "complete.obs")
print(corr.matrix)
##            GrLivArea   LotArea GarageArea SalePrice
## GrLivArea  1.0000000 0.2631162  0.4689975 0.7086245
## LotArea    0.2631162 1.0000000  0.1804028 0.2638434
## GarageArea 0.4689975 0.1804028  1.0000000 0.6234314
## SalePrice  0.7086245 0.2638434  0.6234314 1.0000000
#visualize the correlation matrix
corrplot(corr.matrix, method = "circle")

We can now use the cor.test() function to test the hypothesis that the correlation between each pairwise set of variables is 0, with an 80% confidence interval.

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

cor.test(Train_Data$GarageArea, Train_Data$SalePrice, method = 'pearson', conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  Train_Data$GarageArea and Train_Data$SalePrice
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6024756 0.6435283
## sample estimates:
##       cor 
## 0.6234314

The p-value is essentially 0 and the 80% confidence interval runs from about 0.602 to 0.644, with an estimated correlation of 0.623, indicating a fairly strong positive correlation between GarageArea and SalePrice. We therefore reject the null hypothesis of zero correlation.

cor.test(Train_Data$GrLivArea, Train_Data$SalePrice, method = 'pearson', conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  Train_Data$GrLivArea and Train_Data$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6915087 0.7249450
## sample estimates:
##       cor 
## 0.7086245

The p-value is essentially 0 and the 80% confidence interval runs from about 0.692 to 0.725, with an estimated correlation of 0.709, indicating a strong positive correlation between GrLivArea and SalePrice. We again reject the null hypothesis of zero correlation.
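The remaining pairs can be tested in exactly the same way, for example LotArea against SalePrice:

#Pairwise correlation test for LotArea vs. SalePrice at the 80% confidence level
cor.test(Train_Data$LotArea, Train_Data$SalePrice, method = 'pearson', conf.level = 0.80)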

Family-wise error: the familywise error rate is the probability of making at least one Type I error across all of the hypothesis tests performed. Because the training dataset contains many variables, testing every pairwise correlation would mean running a large number of tests, which inflates that probability. In this case, I would be worried, and a correction such as Bonferroni could be applied.
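As a rough illustration, with m independent tests at level \(\alpha = 0.05\) the familywise error rate is \(1-(1-\alpha)^m\); even the handful of tests above pushes it well past 0.05:

#Familywise error rate for m independent tests at significance level alpha
alpha <- 0.05
m <- 3
1 - (1 - alpha)^m   #roughly 0.14 even for just three tests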

Linear Algebra and Correlation.

Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.)

print(corr.matrix)
##            GrLivArea   LotArea GarageArea SalePrice
## GrLivArea  1.0000000 0.2631162  0.4689975 0.7086245
## LotArea    0.2631162 1.0000000  0.1804028 0.2638434
## GarageArea 0.4689975 0.1804028  1.0000000 0.6234314
## SalePrice  0.7086245 0.2638434  0.6234314 1.0000000
#Invert the Correlation Matrix using the solve() function
precision_matrix = solve(corr.matrix)
print(precision_matrix)
##             GrLivArea     LotArea  GarageArea  SalePrice
## GrLivArea   2.0386551 -0.16538092 -0.08646280 -1.3471026
## LotArea    -0.1653809  1.08871686 -0.02097865 -0.1569790
## GarageArea -0.0864628 -0.02097865  1.64016481 -0.9557256
## SalePrice  -1.3471026 -0.15697899 -0.95572557  2.5918371
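The prompt notes that the diagonal of the precision matrix contains the variance inflation factors; they can be read off directly:

#Variance inflation factors are the diagonal entries of the precision matrix
round(diag(precision_matrix), 3)

All of the diagonal entries (about 2.0, 1.1, 1.6, and 2.6) are well below the usual rule-of-thumb thresholds of 5 or 10, so multicollinearity among these variables is mild.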

Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.

##Multiply the correlation matrix by the precision matrix

corr.matrix %*% precision_matrix %>% round()
##            GrLivArea LotArea GarageArea SalePrice
## GrLivArea          1       0          0         0
## LotArea            0       1          0         0
## GarageArea         0       0          1         0
## SalePrice          0       0          0         1
# Multiply precision matrix by the correlation matrix
precision_matrix %*% corr.matrix %>% round()
##            GrLivArea LotArea GarageArea SalePrice
## GrLivArea          1       0          0         0
## LotArea            0       1          0         0
## GarageArea         0       0          1         0
## SalePrice          0       0          0         1

Both products equal the identity matrix (up to rounding), confirming that the precision matrix is the inverse of the correlation matrix: corr * prec = prec * corr = I.

Conduct LU decomposition on the matrix.

#Conduct LU decomposition of the correlation matrix
cor_lu = lu(corr.matrix)
elu = expand(cor_lu)

#Lower triangular matrix
cor_L = elu$L
print(cor_L)
## 4 x 4 Matrix of class "dtrMatrix" (unitriangular)
##      [,1]       [,2]       [,3]       [,4]      
## [1,] 1.00000000          .          .          .
## [2,] 0.26311617 1.00000000          .          .
## [3,] 0.46899748 0.06124171 1.00000000          .
## [4,] 0.70862448 0.08314923 0.36874445 1.00000000
#Upper triangular matrix
cor_U = elu$U
print(cor_U)
## 4 x 4 Matrix of class "dtrMatrix"
##      [,1]       [,2]       [,3]       [,4]      
## [1,] 1.00000000 0.26311617 0.46899748 0.70862448
## [2,]          . 0.93076988 0.05700194 0.07739280
## [3,]          .          . 0.77655047 0.28634868
## [4,]          .          .          . 0.38582671
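As a sanity check, multiplying the two factors should reproduce the correlation matrix. (expand() also returns a permutation factor P, but no row pivoting was needed here, as the first column of L matches the first column of the correlation matrix exactly.)

#L %*% U should reconstruct the original correlation matrix
round(cor_L %*% cor_U, 4)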

Calculus-Based Probability & Statistics.

Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary.

#select variable skewed to the right
Train_Data %>% ggplot(aes(LotArea)) +
  geom_histogram(fill="blue") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

LotArea is heavily right-skewed, and its minimum value (1,300) is already above zero, so no shift is needed. Next, load the MASS package and run fitdistr to fit an exponential probability density function.

#Fit Exponential distribution
fit_dis <- fitdistr(Train_Data$LotArea, "exponential")
fit_dis
##        rate    
##   9.508570e-05 
##  (2.488507e-06)
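Since the maximum-likelihood estimate of an exponential rate is simply the reciprocal of the sample mean, the fitted rate can be sanity-checked directly; it should match the fitdistr estimate of about 9.51e-05:

#For an exponential distribution, the MLE of the rate is 1 / sample mean
1 / mean(Train_Data$LotArea)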

Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))).

#find optimal lambda and take 1000 samples
optimal <- fit_dis$estimate
samples <- rexp(1000, optimal)
head(samples)
## [1] 10375.353  7705.024 20920.196  2623.708  6892.716  4003.110

Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

#Plot the result vs. original
par(mfrow = c(1, 2))
hist(Train_Data$LotArea, xlab = "LotArea", main = "Original Lot Area Histogram", col = "blue")
hist(samples, xlab = "LotArea", main = "Exponential Lot Area Histogram", col = "gray")

#Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF)
#qexp() inverts the fitted exponential's CDF
qexp(c(0.05, 0.95), rate = optimal)

With the fitted rate of about 9.51e-05, these CDF-based percentiles come out to roughly 539 (5th percentile) and 31,500 (95th percentile).

#Empirical 5th and 95th percentiles of LotArea:
quantile(Train_Data$LotArea, c(0.05, 0.95))
##       5%      95% 
##  3311.70 17401.15

The exponential model puts its 5th percentile far below the empirical value (roughly 539 vs. 3,312) and its 95th percentile far above it (roughly 31,500 vs. 17,401): it concentrates much more probability near zero and in the extreme right tail than the observed LotArea values do, so the exponential distribution is not a particularly good fit for this variable.
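The prompt also asks for a 95% confidence interval from the empirical data, assuming normality; a minimal sketch for the mean of LotArea:

#95% confidence interval for the mean of LotArea, assuming normality
n    <- length(Train_Data$LotArea)
xbar <- mean(Train_Data$LotArea)
s    <- sd(Train_Data$LotArea)
xbar + c(-1, 1) * qnorm(0.975) * s / sqrt(n)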

Modeling.

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

I chose the following variables based on their summary statistics (all numeric) and the likelihood of being correlated with SalePrice: GarageArea, GrLivArea, LotArea, FullBath, BedroomAbvGr, TotRmsAbvGrd, PoolArea, OverallQual, and GarageCars.

All of the selected predictors are already numeric, so no categorical conversion is needed; NAs are replaced with zeros for convenience.

#Fit a multiple regression model of SalePrice on the selected predictors
house.lm <- lm(SalePrice ~ GarageArea + GrLivArea + LotArea + FullBath + BedroomAbvGr + TotRmsAbvGrd + PoolArea + OverallQual + GarageCars, data = Train_Data)
summary(house.lm)
## 
## Call:
## lm(formula = SalePrice ~ GarageArea + GrLivArea + LotArea + FullBath + 
##     BedroomAbvGr + TotRmsAbvGrd + PoolArea + OverallQual + GarageCars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -413635  -20333   -1218   18140  296801 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -8.336e+04  6.271e+03 -13.293  < 2e-16 ***
## GarageArea    3.148e+01  1.047e+01   3.006 0.002697 ** 
## GrLivArea     5.132e+01  4.225e+00  12.149  < 2e-16 ***
## LotArea       7.864e-01  1.076e-01   7.309 4.44e-13 ***
## FullBath      3.076e+03  2.583e+03   1.191 0.233898    
## BedroomAbvGr -1.149e+04  1.810e+03  -6.352 2.84e-10 ***
## TotRmsAbvGrd  2.150e+03  1.316e+03   1.634 0.102440    
## PoolArea     -1.217e+01  2.621e+01  -0.464 0.642520    
## OverallQual   2.584e+04  1.099e+03  23.518  < 2e-16 ***
## GarageCars    1.125e+04  3.100e+03   3.629 0.000294 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39190 on 1450 degrees of freedom
## Multiple R-squared:  0.7581, Adjusted R-squared:  0.7566 
## F-statistic: 504.9 on 9 and 1450 DF,  p-value: < 2.2e-16

The high R-squared value (0.758) and the very small p-value (< 2.2e-16) indicate that the model explains a substantial share of the variation in SalePrice and fits the training data well.
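Before simplifying the model it is worth a quick look at the usual regression diagnostics; a minimal sketch using the built-in plots for the fitted model:

#Standard diagnostic plots: residuals vs. fitted, Q-Q plot, scale-location, leverage
par(mfrow = c(2, 2))
plot(house.lm)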

#Stepwise model selection by AIC to drop uninformative predictors
step_lm <- stepAIC(house.lm, trace=FALSE)
summary(step_lm)
## 
## Call:
## lm(formula = SalePrice ~ GarageArea + GrLivArea + LotArea + BedroomAbvGr + 
##     TotRmsAbvGrd + OverallQual + GarageCars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -421423  -19873   -1244   17879  297041 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -8.355e+04  6.269e+03 -13.327  < 2e-16 ***
## GarageArea    3.001e+01  1.041e+01   2.883    0.004 ** 
## GrLivArea     5.204e+01  4.065e+00  12.801  < 2e-16 ***
## LotArea       7.807e-01  1.075e-01   7.263 6.16e-13 ***
## BedroomAbvGr -1.126e+04  1.797e+03  -6.268 4.82e-10 ***
## TotRmsAbvGrd  2.270e+03  1.309e+03   1.735    0.083 .  
## OverallQual   2.615e+04  1.070e+03  24.436  < 2e-16 ***
## GarageCars    1.199e+04  3.046e+03   3.936 8.66e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39190 on 1452 degrees of freedom
## Multiple R-squared:  0.7578, Adjusted R-squared:  0.7566 
## F-statistic:   649 on 7 and 1452 DF,  p-value: < 2.2e-16
Stepwise selection drops FullBath and PoolArea, the two least significant predictors, while the adjusted R-squared stays at 0.757, so the simpler model fits essentially as well as the full one.

#RO_test <- Test_Data %>%
#  filter(LotArea < 30000) %>%
#  filter(GrLivArea < 3200)

#Subset data based on columns of interest
Test_Data1 <- subset(Test_Data, select=c(GarageArea, GrLivArea, LotArea, FullBath, BedroomAbvGr, TotRmsAbvGrd, PoolArea, OverallQual, GarageCars))

#Replace NA values with zeros (as described above) so that every test row receives a prediction
Test_Data1[is.na(Test_Data1)] <- 0
test_pred = predict(house.lm, newdata = Test_Data1)
#Compare actual vs. predictive summary statistics

summary(test_pred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -8002  127665  169993  178212  220341  552944
summary(Train_Data$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
rpckgd = cbind(Test_Data$Id, test_pred)
colnames(rpckgd) = c("Id", "SalePrice")
rpckgd[rpckgd<0] <- 0 #account for negative values
Final_Pred = as.data.frame(rpckgd)
head(Final_Pred) #verify output
##     Id SalePrice
## 1 1461  126040.9
## 2 1462  153683.1
## 3 1463  162571.7
## 4 1464  185873.9
## 5 1465  225346.6
## 6 1466  187564.4
#write.csv(Final_Pred, "HousePricePreds.csv", row.names=FALSE)

Kaggle Submission - Score

Kaggle Username: gehadgad

Kaggle Score: 0.43296