library(ggplot2)
library(moments)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(MASS)
library(caret)
## Loading required package: lattice
train  = read.csv('math_final_train.csv', header = TRUE)
test  = read.csv('math_final_test.csv', header = TRUE)

ggplot(data = train, aes(x = LotArea)) + 
  geom_density(alpha = .2, fill = "#003333") +
  ggtitle("Lot Area")

I am picking LotArea as the X variable and SalePrice as the Y variable.

summary(train$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000

Probability.

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.

  1. P(X>x | Y>y)
  2. P(X>x, Y>y)
  3. P(X<x | Y>y)

The meaning of these probabilities:

  1. the probability that LotArea is greater than its first quartile, given that SalePrice is greater than its first quartile.

  2. the probability that both events happen at once: LotArea and SalePrice are each greater than their first quartiles.

  3. the probability that LotArea is smaller than its first quartile, given that SalePrice is greater than its first quartile.

#Assign quartile values to variables.
XQ1<-quantile(train$LotArea, 0.25)
YQ1<-quantile(train$SalePrice, 0.25)

#Create subsets of the data based on the first-quartile cutoffs.
yY <- subset(train,SalePrice <= YQ1)
Yy <- subset(train,SalePrice > YQ1)
Xx_Yy<- subset(Yy, LotArea > XQ1)
Xx_yY<- subset(yY, LotArea > XQ1)
xX_Yy<- subset(Yy, LotArea <= XQ1)
xX_yY<- subset(yY, LotArea <= XQ1)
#P(X>x, Y>y): proportion with both variables above their 1st quartiles
a <- nrow(Xx_Yy)
nrow(Xx_Yy)/nrow(train)
## [1] 0.6150685
#P(X>x, Y<=y)
b <- nrow(Xx_yY)
nrow(Xx_yY)/nrow(train)
## [1] 0.1349315
#P(X<=x, Y>y)
c <- nrow(xX_Yy)
nrow(xX_Yy)/nrow(train)
## [1] 0.1349315
#P(X<=x, Y<=y)
d <-nrow(xX_yY)
nrow(xX_yY)/nrow(train)
## [1] 0.1150685
#Build the 3x3 count table: rows split LotArea (X) at its 1st quartile, columns split SalePrice (Y)
table <- matrix(c(d,c,(d+c),b,a,(b+a),(b+d),(c+a),(a+b+c+d)),ncol=3, nrow=3,byrow=TRUE)
colnames(table) <- c("<=1Q", ">1Q", "Total")
rownames(table) <- c('<=1Q', '>1Q', 'Total')
result_table <- as.table(table)
result_table
##       <=1Q  >1Q Total
## <=1Q   168  197   365
## >1Q    197  898  1095
## Total  365 1095  1460
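
The proportions above divide each quadrant count by the full sample size, so they are joint probabilities. A minimal sketch of the corresponding conditional probabilities, reusing the counts already stored in a through d:

#Conditional probabilities, conditioning on SalePrice being above its 1st quartile
a / (a + c)       # P(X > x | Y > y): 898 / 1095, about 0.82
c / (a + c)       # P(X < x | Y > y): 197 / 1095, about 0.18
a / nrow(train)   # P(X > x, Y > y): the joint probability reported above, about 0.615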

Does splitting the training data in this fashion make them independent?

No. Independence is a property of the relationship between X and Y, not of how the sample is partitioned. Splitting the data into these quadrants does not change that relationship; it only restricts which part of the domain we are looking at.

Does P(A|B)=P(A)P(B)?

Let A be the new variable counting those observations above the 3rd quartile for X, and let B be the new variable counting those observations for the 2nd quartile for Y. Does P(A|B)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.

#Observations at or above the 1st quartile for X
Xx_A <- subset(train, LotArea >= XQ1)

#Observations split at the 1st quartile for Y
YQ1 <- quantile(train$SalePrice, 0.25)
Yy_B_1 <- subset(train, SalePrice <= YQ1)
YY_B_2 <- subset(train, SalePrice >= YQ1)
#P(A, B): joint probability that both A and B occur
YY_XX <- subset(YY_B_2, LotArea >= XQ1)
res1 <- nrow(YY_XX)/nrow(train)
#P(A)P(B)
res2 <- (nrow(Xx_A)/nrow(train))*nrow(YY_B_2)/nrow(train)
print(c("P(A|B)=P(A)P(B)?: ", res1==res2))
## [1] "P(A|B)=P(A)P(B)?: " "FALSE"

Since the joint probability is not exactly equal to the product of the marginal probabilities, the variables do not appear to be independent, and the chi-square test below corroborates this.
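
Comparing two floating-point proportions with == will almost always return FALSE; a minimal sketch of looking at the actual numbers, reusing the subsets defined above:

#Compare the joint probability with the product of the marginals,
#and the conditional P(A|B) with the marginal P(A)
pA  <- nrow(Xx_A) / nrow(train)      # P(A): LotArea at or above its 1st quartile
pB  <- nrow(YY_B_2) / nrow(train)    # P(B): SalePrice at or above its 1st quartile
pAB <- nrow(YY_XX) / nrow(train)     # P(A, B)
c(joint = pAB, product = pA * pB, conditional = pAB / pB, marginal = pA)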

Chi-Square test

chi_table<- table(train$LotArea, train$SalePrice)
suppressWarnings(chisq.test(chi_table))
## 
##  Pearson's Chi-squared test
## 
## data:  chi_table
## X-squared = 735090, df = 709660, p-value < 2.2e-16
## 
##  Pearson's Chi-squared test
## 
## data:  chi_table
## X-squared = 944670, df = 921920, p-value < 2.2e-16

We obtain a p-value below 2.2e-16, so we reject the null hypothesis that X is independent of Y.

Descriptive and Inferential Statistics

Variable Density Plots

#Collect summary statistics
LotArea.mean <-mean(train$LotArea)
LotArea.median <-median(train$LotArea)
LotArea.mode <- as.numeric(names(sort(-table(train$LotArea))))[1]
LotArea.sd <- sd(train$LotArea)
SalePrice.mean <-mean(train$SalePrice)
SalePrice.median <-median(train$SalePrice)
SalePrice.mode <- as.numeric(names(sort(-table(train$SalePrice))))[1]
SalePrice.sd <- sd(train$SalePrice)
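
Since the moments package is loaded, here is a small sketch that quantifies the skew the density plots below make visible:

#Skewness and kurtosis (moments package; a normal distribution has skewness 0 and kurtosis 3)
c(LotArea = skewness(train$LotArea), SalePrice = skewness(train$SalePrice))
c(LotArea = kurtosis(train$LotArea), SalePrice = kurtosis(train$SalePrice))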

#Create density plot for LotArea variable
d_LotArea <- density(train$LotArea)
plot(d_LotArea, main="LotArea Probabilities", ylab="Probability", xlab="LotArea")
polygon(d_LotArea, col="red")
abline(v = LotArea.median, col = "green", lwd = 2)
abline(v = LotArea.mean, col = "blue", lwd = 2)
abline(v = LotArea.mode, col = "purple", lwd = 2)
legend("topright", legend=c("median", "mean","mode"),col=c("green", "blue", "purple"), lty=1, cex=0.8)

#Create density plot for SalePrice variable.
SalePrice_for_graph <- density(train$SalePrice, na.rm=TRUE)
plot(SalePrice_for_graph, main="SalePrice Probabilities", ylab="Probability", xlab="SalePrice")
polygon(SalePrice_for_graph, col="red")
abline(v = SalePrice.median, col = "green", lwd = 2)
abline(v = SalePrice.mean, col = "blue", lwd = 2)
abline(v = SalePrice.mode, col = "purple", lwd = 2)
legend("topright", legend=c("median", "mean","mode"),col=c("green", "blue", "purple"), lty=1, cex=0.8 )

Scatterplot

plot(train$LotArea,train$SalePrice, main="LotArea vs SalePrice Scatterplot", 
     xlab="LotArea", ylab="SalePrice", pch=3)

95% Confidence Interval

Provide a 95% Confidence Interval for the difference in the mean of the variables.

t.test(train$LotArea,train$SalePrice)
## 
##  Welch Two Sample t-test
## 
## data:  train$LotArea and train$SalePrice
## t = -81.321, df = 1505.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -174514.7 -166294.1
## sample estimates:
## mean of x mean of y 
##  10516.83 180921.20

Again, since the p-value is far below 0.05, we reject the null hypothesis that the difference in means is equal to 0.

Correlation matrix

Derive a correlation matrix for THREE of the quantitative variables you selected.

corMatrix<-cor(train[, which(names(train) %in% c("LotArea", "SalePrice", 'GrLivArea'))])
corMatrix
##             LotArea GrLivArea SalePrice
## LotArea   1.0000000 0.2631162 0.2638434
## GrLivArea 0.2631162 1.0000000 0.7086245
## SalePrice 0.2638434 0.7086245 1.0000000

These results show a weak positive correlation of 0.2638 between LotArea and SalePrice, while the correlation between GrLivArea and SalePrice is much stronger at 0.7086.

Test the hypothesis

Test the hypothesis that the correlation between these variables is 0 and provide a 92% confidence interval.

t.test(train$LotArea,train$SalePrice, conf.level=0.92)
## 
##  Welch Two Sample t-test
## 
## data:  train$LotArea and train$SalePrice
## t = -81.321, df = 1505.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 92 percent confidence interval:
##  -174075.3 -166733.4
## sample estimates:
## mean of x mean of y 
##  10516.83 180921.20
t.test(train$LotArea,train$GrLivArea, conf.level=0.92)
## 
##  Welch Two Sample t-test
## 
## data:  train$LotArea and train$GrLivArea
## t = 34.411, df = 1467.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 92 percent confidence interval:
##  8543.097 9459.632
## sample estimates:
## mean of x mean of y 
## 10516.828  1515.464
t.test(train$GrLivArea,train$SalePrice, conf.level=0.92)
## 
##  Welch Two Sample t-test
## 
## data:  train$GrLivArea and train$SalePrice
## t = -86.288, df = 1459.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 92 percent confidence interval:
##  -183048.2 -175763.3
## sample estimates:
##  mean of x  mean of y 
##   1515.464 180921.196

In all three pairwise tests, the difference in means is significantly different from 0.
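
The t-tests above test differences in means; the question, however, asks about the correlation itself. A minimal sketch of testing H0: correlation = 0 with a 92% confidence interval, using cor.test on the same pairs:

#Test that the correlation is 0 and report a 92% confidence interval for each pair
cor.test(train$LotArea, train$SalePrice, conf.level = 0.92)
cor.test(train$LotArea, train$GrLivArea, conf.level = 0.92)
cor.test(train$GrLivArea, train$SalePrice, conf.level = 0.92)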

Would you be worried about familywise error? Why or why not?

We are running only three tests and their p-values are extremely small, so even after a multiple-comparison correction (for example Bonferroni, 0.05/3) every result remains significant; familywise error is therefore not a major worry here. That said, these results by themselves do not say much about the final sale price.

Linear Algebra and Correlation

Invert your correlation matrix (Precision matrix).

corMatrixInverse <- ginv(corMatrix)
corMatrixInverse
##            [,1]       [,2]       [,3]
## [1,]  1.0884485 -0.1664868 -0.1692033
## [2,] -0.1664868  2.0340972 -1.3974846
## [3,] -0.1692033 -1.3974846  2.0349350

The diagonal elements of the precision matrix are the variance inflation factors: each one measures how much the variance of a variable is inflated by its linear relationship with the other variables.
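
A quick way to check that interpretation, assuming the same three columns of train: the diagonal entry for SalePrice should equal 1/(1 - R^2), where R^2 comes from regressing SalePrice on the other two variables.

#Verify the VIF interpretation for the SalePrice diagonal element (about 2.03)
r2 <- summary(lm(SalePrice ~ LotArea + GrLivArea, data = train))$r.squared
1 / (1 - r2)
diag(corMatrixInverse)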

Multiply the correlation matrix by the precision matrix.

This gives the identity matrix (the off-diagonal entries are only floating-point rounding error).

matrix1<- corMatrixInverse %*% corMatrix
matrix1
##            LotArea     GrLivArea     SalePrice
## [1,]  1.000000e+00 -2.465530e-16 -8.326673e-17
## [2,] -2.311479e-16  1.000000e+00 -2.220446e-16
## [3,] -1.110223e-16  2.220446e-16  1.000000e+00

Multiply the precision matrix by the correlation matrix.

Multiplying in the opposite order also returns the identity matrix; only the tiny off-diagonal rounding errors differ, since a matrix and its inverse commute.

matrix2<- corMatrix %*% corMatrixInverse
matrix2
##                    [,1]          [,2]          [,3]
## LotArea    1.000000e+00 -5.580671e-16  1.110223e-16
## GrLivArea -3.848319e-17  1.000000e+00 -2.220446e-16
## SalePrice -8.326673e-17  0.000000e+00  1.000000e+00
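
Rounding both products confirms that the off-diagonal terms are only floating-point noise:

#Both products round to the 3x3 identity matrix
round(matrix1, 10)
round(matrix2, 10)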

Principal Components Analysis

#Perform a log transform on each variable to normalize
dataCopy<-train
dataCopy$LotArea<-log(dataCopy$LotArea)
dataCopy$SalePrice<-log(dataCopy$SalePrice)
dataCopy$GrLivArea<-log(dataCopy$GrLivArea)

#apply PCA to the three log-transformed variables, centering and scaling them first

data.pca<-prcomp(dataCopy[, which(names(dataCopy) %in% c("LotArea", "SalePrice", "GrLivArea"))],center = TRUE,scale = TRUE)
data.pca
## Standard deviations (1, .., p=3):
## [1] 1.4246936 0.8370813 0.5191755
## 
## Rotation (n x k) = (3 x 3):
##                 PC1        PC2         PC3
## LotArea   0.4746802  0.8799375  0.01971454
## GrLivArea 0.6204099 -0.3503987  0.70164972
## SalePrice 0.6243159 -0.3208281 -0.71224926
summary(data.pca)
## Importance of components:
##                           PC1    PC2     PC3
## Standard deviation     1.4247 0.8371 0.51918
## Proportion of Variance 0.6766 0.2336 0.08985
## Cumulative Proportion  0.6766 0.9102 1.00000
biplot(data.pca)

Vectors that point in the same direction correspond to variables with similar response profiles and can be interpreted as carrying similar information in the context set by the data. Here the SalePrice and GrLivArea vectors point in nearly the same direction, which the regression results later corroborate.

screeplot(data.pca, type="lines")

Calculus-Based Probability & Statistics

We take the LotArea data and fit it to an exponential distribution.

lambda<-fitdistr(train$LotArea,"exponential")
lambda$estimate
##        rate 
## 9.50857e-05
#Draw 1000 random samples from an exponential distribution with the fitted rate
pdf_distr<-rexp(1000, lambda$estimate)

#Plot the results of the exponential distribution
hist(pdf_distr, freq = FALSE, breaks = 100, main ="Fitted Exponential PDF with LotArea", xlim = c(1, quantile(pdf_distr, 0.99)))
curve(dexp(x, rate = lambda$estimate), col = "green", add = TRUE)

#Plot the results as compared to the original data
hist(train$LotArea, freq = FALSE, breaks = 100, main ="Exponential VS original LotArea data",xlim = c(1, quantile(train$LotArea, 0.99)))
curve(dexp(x, rate = lambda$estimate), col = "green", add = TRUE)

With the exponential PDF:

5th and 95th percentiles using the cumulative distribution function (CDF)

qexp(0.05, rate = lambda$estimate, lower.tail = TRUE, log.p = FALSE)
## [1] 539.4428
qexp(0.95, rate = lambda$estimate, lower.tail = TRUE, log.p = FALSE)
## [1] 31505.6

95% confidence interval from the empirical data, assuming normality

qnorm(0.95,LotArea.mean, LotArea.sd)
## [1] 26934.55
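
Note that qnorm(0.95, mean, sd) returns the 95th percentile of a normal distribution with the empirical mean and standard deviation, not a confidence interval. A minimal sketch of a two-sided 95% confidence interval for the mean of LotArea, assuming normality and reusing the statistics computed earlier:

#Two-sided 95% confidence interval for the mean of LotArea, assuming normality
n <- length(train$LotArea)
LotArea.mean + c(-1, 1) * qnorm(0.975) * LotArea.sd / sqrt(n)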

Empirical 5th percentile and 95th percentile of the data

quantile(train$LotArea, c(.05, .95))
##       5%      95% 
##  3311.70 17401.15

What do these values mean?

Comparing these values shows how the fitted exponential distribution differs from the right-skewed empirical data: the exponential model places far more mass in the tails (5th and 95th percentiles of roughly 539 and 31,506 versus the empirical 3,312 and 17,401). The approximation captures the general right skew and offers a simpler way to describe the data, but it overstates the spread of LotArea.

Other numeric plots

Here is a way to plot all of the numeric variables in order to examine which ones may be useful to work with; as we will see in the next section, the regression method will indicate which variables matter most.

library(moments)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Density, scatter-vs-response, and Q-Q plots for a single numeric column
intPlot <- function(v, name, target){
  plot(density(na.omit(v)), main=paste(name,"(Density)"))
  polygon(density(na.omit(v)), col="red", border="blue")
  plot(target, v, pch=21, main=paste(name," vs Response"))
  qqnorm(v, main=paste(name,"Q-Q Plot"))
}

# Loop over the columns of a data frame and plot every integer-typed column
plots <- function (v, target){
  par(mfrow=c(2,4))
  for(name in names(v)){
    if(class(v[[name]]) == 'integer'){
      intPlot(v[[name]], name, target)
    }
  }
}
#print plots for all variables that are numeric.
plots(train, train$SalePrice)

Model Selection / Regression

I initially used a stepwise procedure, with a linear regression model as the lower formula and a binomial regression model as the upper formula; the result was much better than the linear formula alone.

Now I am using the random forest method to predict SalePrice; this algorithm is computationally heavier, but the predictions are much more effective.

#Keep only the numeric columns and replace missing values with 0
normalize <- function(data){
  subset <- select_if(data, is.numeric)
  subset[is.na(subset)] <- 0
  subset <- subset[complete.cases(subset),]  #defensive; no incomplete rows remain after the NA fill
  return(subset)
}


trainMod <- normalize(train)
testMod <- normalize(test)
null <- lm(SalePrice~1, trainMod)
all <- glm(as.factor(SalePrice) ~ LotArea+GrLivArea, data=trainMod, family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
stepResults <- step(null, scope = list(lower = null, upper = all), direction = "both",trace = 0)
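
The stepwise fit is not examined further here; a quick sketch of inspecting which terms survived the search, assuming the stepResults object fitted above:

#Inspect the formula and coefficient estimates chosen by the stepwise search
formula(stepResults)
summary(stepResults)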

rfFit <-train(SalePrice ~.,
              data=trainMod,
              method="rf",
              trControl=trainControl(method="oob",number=5),
              prox=TRUE, importance = TRUE,
              allowParallel=TRUE)

# show the model summary          
rfFit
## Random Forest 
## 
## 1460 samples
##   37 predictor
## 
## No pre-processing
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared 
##    2    31976.41  0.8378746
##   19    29207.33  0.8647381
##   37    29618.86  0.8608996
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 19.
# display the variables determined to be the most relevant
dotPlot(varImp(rfFit), main = "Random Forest Model - Most Relevant Variables")

# predict              

#append scored data

result <- data.frame('Id' = testMod$Id,'SalePrice' = predict(rfFit, testMod))
result$SalePrice[result$SalePrice<0] <- 0

plot(density(trainMod$SalePrice))

plot(density(na.omit(result$SalePrice)))

Save the results to send to Kaggle

write.csv(result, file = "results_for_kaggle.csv",row.names=FALSE)
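
Kaggle scores this housing competition on the root mean squared error of the log prices (RMSLE). A rough in-sample sanity check of that metric, assuming the fitted rfFit and trainMod objects from above (the leaderboard score on held-out data will be higher):

#In-sample RMSLE of the random forest predictions (optimistic, since the model saw this data)
train_pred <- predict(rfFit, trainMod)
sqrt(mean((log(train_pred) - log(trainMod$SalePrice))^2))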

When using ‘cv’ as the trainControl method:

> Username: Delagroove
> Score: 0.14829

When using ‘oob’ as the trainControl method:

> Username: Delagroove
> Score: 0.14813
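
For reference, a minimal sketch of the ‘cv’ variant of the same train() call (only the trainControl method changes from the ‘oob’ version above):

#Same random forest, but with 5-fold cross-validation instead of out-of-bag resampling
rfFitCV <- train(SalePrice ~ ., data = trainMod, method = "rf",
                 trControl = trainControl(method = "cv", number = 5),
                 importance = TRUE, allowParallel = TRUE)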