library(ggplot2)
library(moments)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(MASS)
library(caret)
## Loading required package: lattice
train = read.csv('math_final_train.csv', header = TRUE)
test = read.csv('math_final_test.csv', header = TRUE)
ggplot(data = train, aes(x = LotArea) )+
geom_density(alpha = .2, fill = "#003333")+
ggtitle("Lot Area")
I am picking LotArea as the X variable and SalePrice as the Y variable.
summary(train$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
## 34900 129975 163000 180921 214000 755000
Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.
The meaning of these probabilities is as follows: a) P(X > x | Y > y) is the probability that LotArea is above its first quartile, given that SalePrice is above its first quartile. b) P(X > x, Y > y) is the joint probability that both events happen together. c) P(X < x | Y > y) is the probability that LotArea is below its first quartile, given that SalePrice is above its first quartile.
#Assign quartile values to variables.
XQ1<-quantile(train$LotArea, 0.25)
YQ1<-quantile(train$SalePrice, 0.25)
#Create subsets of data based on quartile operators.
yY <- subset(train,SalePrice <= YQ1)
Yy <- subset(train,SalePrice > YQ1)
Xx_Yy<- subset(Yy, LotArea > XQ1)
Xx_yY<- subset(yY, LotArea > XQ1)
xX_Yy<- subset(Yy, LotArea <= XQ1)
xX_yY <- subset(yY, LotArea <= XQ1)
#for P(X>x|Y>y)
a <- nrow(Xx_Yy)
nrow(Xx_Yy)/nrow(train)
## [1] 0.6150685
#for P(X>x|y<Y)
b <- nrow(Xx_yY)
nrow(Xx_yY)/nrow(train)
## [1] 0.1349315
#for P(X<x|Y>y)
c <- nrow(xX_Yy)
nrow(xX_Yy)/nrow(train)
## [1] 0.1349315
#P(X<x|y<Y)
d <-nrow(xX_yY)
nrow(xX_yY)/nrow(train)
## [1] 0.1150685
table <- matrix(c(d,c,(d+c),b,a,(b+a),(b+d),(c+a),(a+b+c+d)),ncol=3, nrow=3,byrow=TRUE)
colnames(table) <- c("<=1Q", ">1Q", "Total")
rownames(table) <- c('<=1Q', '>1Q', 'Total')
result_table <- as.table(table)
result_table
##        <=1Q  >1Q Total
## <=1Q    168  197   365
## >1Q     197  898  1095
## Total   365 1095  1460
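In the table above, the rows refer to LotArea relative to its first quartile and the columns to SalePrice. Note that the proportions printed earlier divide by the full training set, so they are joint probabilities; the conditional probabilities described in the interpretations can be recovered by dividing by the size of the conditioning subset instead. A minimal sketch, using the subsets already defined above:
# Sketch: conditional probabilities, dividing by the conditioning subset
# (Yy, i.e. SalePrice > 1st quartile) rather than the whole training set.
nrow(Xx_Yy) / nrow(Yy)   # P(X > x | Y > y)
nrow(xX_Yy) / nrow(Yy)   # P(X < x | Y > y)
# Marginal probability for comparison; a gap between the conditional and
# the marginal suggests the variables are not independent.
nrow(subset(train, LotArea > XQ1)) / nrow(train)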
No. Independence describes whether there is a relationship between X and Y. Splitting the data does not change that relationship; it only changes the extent of the problem domain.
Let A be the new variable counting those observations above the 3rd quartile for X, and let B be the new variable counting those observations for the 2nd quartile for Y. Does P(A|B)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.
#Observations above the 1st quartile for X
Xx_A <- subset(train, LotArea >= XQ1)
#Observations for the 1st quartile for Y
YQ1 <- quantile(train$SalePrice, 0.25)
Yy_B_1 <- subset(train, SalePrice <= YQ1)
YY_B_2 <- subset(train, SalePrice >= YQ1)
#P(A and B), the joint probability used in the independence check
YY_XX <- subset(YY_B_2, LotArea >= XQ1)
res1 <- nrow(YY_XX)/nrow(train)
#P(A)P(B)
res2 <- (nrow(Xx_A)/nrow(train))*nrow(YY_B_2)/nrow(train)
print(c("P(A|B)=P(A)P(B)?: ", res1==res2))
## [1] "P(A|B)=P(A)P(B)?: " "FALSE"
The variables are not independent, as the chi-square test below confirms.
chi_table<- table(train$LotArea, train$SalePrice)
suppressWarnings(chisq.test(chi_table))
##
## Pearson's Chi-squared test
##
## data: chi_table
## X-squared = 735090, df = 709660, p-value < 2.2e-16
##
## Pearson's Chi-squared test
##
## data: chi_table
## X-squared = 944670, df = 921920, p-value < 2.2e-16
We receive a p-value of < 2.2e-16. Therefore, we reject the null hypothesis that X is independent of Y.
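As an alternative sketch (not the test reported above), the association could also be tested on a 2x2 table of the quartile indicators rather than on the raw continuous values, which keeps the degrees of freedom small; XQ1 and YQ1 are the quartiles computed earlier:
# Sketch: chi-square test of association between the quartile indicators.
quartile_table <- table(train$LotArea > XQ1, train$SalePrice > YQ1)
chisq.test(quartile_table)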
#Collect summary statistics
LotArea.mean <-mean(train$LotArea)
LotArea.median <-median(train$LotArea)
LotArea.mode <- as.numeric(names(sort(-table(train$LotArea))))[1]
LotArea.sd <- sd(train$LotArea)
SalePrice.mean <-mean(train$SalePrice)
SalePrice.median <-median(train$SalePrice)
SalePrice.mode <- as.numeric(names(sort(-table(train$SalePrice))))[1]
SalePrice.sd <- sd(train$SalePrice)
#Create density plot for LotArea variable
d_LotArea <- density(train$LotArea)
plot(d_LotArea, main="LotArea Probabilities", ylab="Probability", xlab="LotArea")
polygon(d_LotArea, col="red")
abline(v = LotArea.median, col = "green", lwd = 2)
abline(v = LotArea.mean, col = "blue", lwd = 2)
abline(v = LotArea.mode, col = "purple", lwd = 2)
legend("topright", legend=c("median", "mean","mode"), col=c("green", "blue", "purple"), lty=1, cex=0.8)
#Create density plot for SalePrice variable.
SalePrice_for_graph <- density(train$SalePrice, na.rm=TRUE)
plot(SalePrice_for_graph, main="SalePrice Probabilities", ylab="Probability", xlab="SalePrice")
polygon(SalePrice_for_graph, col="red")
abline(v = SalePrice.median, col = "green", lwd = 2)
abline(v = SalePrice.mean, col = "blue", lwd = 2)
abline(v = SalePrice.mode, col = "purple", lwd = 2)
legend("topright", legend=c("median", "mean","mode"), col=c("green", "blue", "purple"), lty=1, cex=0.8)
plot(train$LotArea, train$SalePrice, main="LotArea vs SalePrice Scatterplot",
     xlab="LotArea", ylab="SalePrice", pch=3)
Provide a 95% Confidence Interval for the difference in the mean of the variables.
t.test(train$LotArea, train$SalePrice)
##
## Welch Two Sample t-test
##
## data: train$LotArea and train$SalePrice
## t = -81.321, df = 1505.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -174514.7 -166294.1
## sample estimates:
## mean of x mean of y
## 10516.83 180921.20
Again, since the p-value is far below 0.05, we reject the null hypothesis that the difference in means is equal to 0.
Derive a correlation matrix for THREE of the quantitative variables you selected.
corMatrix<-cor(train[, which(names(train) %in% c("LotArea", "SalePrice", 'GrLivArea'))])
corMatrix
##           LotArea GrLivArea SalePrice
## LotArea 1.0000000 0.2631162 0.2638434
## GrLivArea 0.2631162 1.0000000 0.7086245
## SalePrice 0.2638434 0.7086245 1.0000000
These results show a weak positive correlation of 0.2638 between LotArea and SalePrice, while the correlation between GrLivArea and SalePrice is high at 0.7086.
Test the hypothesis that the correlation between these variables is 0 and provide a 92% confidence interval.
t.test(train$LotArea, train$SalePrice, conf.level=0.92)
##
## Welch Two Sample t-test
##
## data: train$LotArea and train$SalePrice
## t = -81.321, df = 1505.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 92 percent confidence interval:
## -174075.3 -166733.4
## sample estimates:
## mean of x mean of y
## 10516.83 180921.20
t.test(train$LotArea, train$GrLivArea, conf.level=0.92)
##
## Welch Two Sample t-test
##
## data: train$LotArea and train$GrLivArea
## t = 34.411, df = 1467.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 92 percent confidence interval:
## 8543.097 9459.632
## sample estimates:
## mean of x mean of y
## 10516.828 1515.464
t.test(train$GrLivArea, train$SalePrice, conf.level=0.92)
##
## Welch Two Sample t-test
##
## data: train$GrLivArea and train$SalePrice
## t = -86.288, df = 1459.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 92 percent confidence interval:
## -183048.2 -175763.3
## sample estimates:
## mean of x mean of y
## 1515.464 180921.196
In all three cases the difference in means is significantly different from 0.
The correlations among the variables are not so high that we need to worry about multicollinearity, though LotArea by itself does not say much about the final sale price.
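For reference, the correlation hypotheses themselves could be tested directly with cor.test, which also returns a confidence interval for the correlation coefficient; this is a sketch at the 92% level, rather than the mean-difference t-tests used above:
# Sketch: test H0 that each pairwise correlation is zero, with 92% CIs
# for the correlation coefficients themselves.
cor.test(train$LotArea, train$SalePrice, conf.level = 0.92)
cor.test(train$LotArea, train$GrLivArea, conf.level = 0.92)
cor.test(train$GrLivArea, train$SalePrice, conf.level = 0.92)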
Invert your correlation matrix (Precision matrix).
corMatrixInverse <- ginv(corMatrix)
corMatrixInverse
##            [,1]       [,2]       [,3]
## [1,] 1.0884485 -0.1664868 -0.1692033
## [2,] -0.1664868 2.0340972 -1.3974846
## [3,] -0.1692033 -1.3974846 2.0349350
The diagonal elements of the precision matrix are the variance inflation factors, which measure how strongly each variable is linearly related to the other variables in the matrix.
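A quick sketch of that relationship: each diagonal element equals 1/(1 - R^2), where R^2 comes from regressing the corresponding variable on the other two (R^2 is scale-invariant, so the raw columns can be used):
# Sketch: reproduce the first diagonal element of the precision matrix as a VIF.
r2_lot <- summary(lm(LotArea ~ GrLivArea + SalePrice, data = train))$r.squared
1 / (1 - r2_lot)  # should be close to corMatrixInverse[1, 1] (about 1.088)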
Multiplying the precision matrix by the correlation matrix gives the identity matrix (up to floating-point error):
matrix1<- corMatrixInverse %*% corMatrix
matrix1
##        LotArea     GrLivArea     SalePrice
## [1,] 1.000000e+00 -2.465530e-16 -8.326673e-17
## [2,] -2.311479e-16 1.000000e+00 -2.220446e-16
## [3,] -1.110223e-16 2.220446e-16 1.000000e+00
Multiplying in the reverse order also yields the identity matrix, confirming that the precision matrix is the inverse of the correlation matrix regardless of the order of multiplication:
matrix2<- corMatrix %*% corMatrixInverse
matrix2
##                [,1]          [,2]          [,3]
## LotArea 1.000000e+00 -5.580671e-16 1.110223e-16
## GrLivArea -3.848319e-17 1.000000e+00 -2.220446e-16
## SalePrice -8.326673e-17 0.000000e+00 1.000000e+00
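As a quick check (a sketch), base R's solve() gives the same precision matrix as MASS::ginv() when the correlation matrix is full rank:
# Sketch: compare the generalized inverse with the ordinary inverse.
all.equal(ginv(corMatrix), solve(corMatrix), check.attributes = FALSE)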
#Perform a log transform on each variable to normalize
dataCopy<-train
dataCopy$LotArea<-log(dataCopy$LotArea)
dataCopy$SalePrice<-log(dataCopy$SalePrice)
dataCopy$GrLivArea<-log(dataCopy$GrLivArea)
#apply PCA and ADD additional parameters for a more interesting interpretation
data.pca<-prcomp(dataCopy[, which(names(dataCopy) %in% c("LotArea", "SalePrice", "GrLivArea"))],center = TRUE,scale = TRUE)
data.pca
## Standard deviations (1, .., p=3):
## [1] 1.4246936 0.8370813 0.5191755
##
## Rotation (n x k) = (3 x 3):
## PC1 PC2 PC3
## LotArea 0.4746802 0.8799375 0.01971454
## GrLivArea 0.6204099 -0.3503987 0.70164972
## SalePrice 0.6243159 -0.3208281 -0.71224926
summary(data.pca)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.4247 0.8371 0.51918
## Proportion of Variance 0.6766 0.2336 0.08985
## Cumulative Proportion 0.6766 0.9102 1.00000
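The Proportion of Variance row can be reproduced directly from the component standard deviations stored in the prcomp object (a quick sketch):
# Sketch: variance explained by each principal component.
data.pca$sdev^2 / sum(data.pca$sdev^2)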
biplot(data.pca)
Vectors that point in the same direction correspond to variables with similar response profiles and can be interpreted as having similar meaning in the context set by the data. Here we see that SalePrice and GrLivArea have very similar vectors pointing in the same direction; this will be corroborated later by the regression results.
screeplot(data.pca, type="lines")
We take the LotArea data and fit it to an exponential distribution.
lambda<-fitdistr(train$LotArea,"exponential")
lambda$estimate
##        rate
## 9.50857e-05
#Draw 1000 random samples from an exponential distribution using the fitted rate
pdf_distr<-rexp(1000, lambda$estimate)
#Plot the results of the exponential distribution
hist(pdf_distr, freq = FALSE, breaks = 100, main ="Fitted Exponential PDF with LotArea", xlim = c(1, quantile(pdf_distr, 0.99)))
curve(dexp(x, rate = lambda$estimate), col = "green", add = TRUE)
#Plot the results as compared to the original data
hist(train$LotArea, freq = FALSE, breaks = 100, main ="Exponential VS original LotArea data",xlim = c(1, quantile(train$LotArea, 0.99)))
curve(dexp(x, rate = lambda$estimate), col = "green", add = TRUE)
With the exponential PDF:
5th and 95th percentiles using the cumulative distribution function (CDF)
qexp(0.05, rate = lambda$estimate, lower.tail = TRUE, log.p = FALSE)
## [1] 539.4428
qexp(0.95, rate = lambda$estimate, lower.tail = TRUE, log.p = FALSE)
## [1] 31505.6
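These quantiles follow directly from the exponential CDF, F(q) = 1 - exp(-rate * q), so the p-th quantile is -log(1 - p)/rate; a quick sketch that should match the qexp() values above:
# Sketch: exponential quantiles in closed form.
-log(1 - c(0.05, 0.95)) / lambda$estimate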
95% confidence interval from the empirical data, assuming normality
qnorm(0.95, LotArea.mean, LotArea.sd)
## [1] 26934.55
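The single qnorm() call above gives only the upper 95th percentile under the normality assumption; a two-sided 95% interval under that same assumption (a sketch using LotArea.mean and LotArea.sd from earlier) takes both tails:
# Sketch: two-sided 95% interval assuming LotArea were normally distributed.
qnorm(c(0.025, 0.975), LotArea.mean, LotArea.sd)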
Empirical 5th percentile and 95th percentile of the data
quantile(train$LotArea, c(.05, .95))
##       5%      95%
## 3311.70 17401.15
This helps us see how an exponential model differs from the right-skewed empirical data. The approximation can be useful for fitting different models and explaining the data in a simpler fashion.
Here is a way to plot all numeric variables in order to examine which ones are worth working with; as we will see in the next section, the regression method will indicate which variables are the most useful.
library(moments)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
intPlot <-function(v, name, target){
plot(density(na.omit(v)), main=paste(name,"(Density)"))
polygon(density(na.omit(v)), col="red", border="blue")
plot(target, v, pch=21, main=paste(name," vs Response"))
qqnorm(v,main=paste(name,"Q-Q Plot"))
}
plots <- function (v, target){
par(mfrow=c(2,4))
i<-0
for(j in names(v)){
i<-i+1
if(sapply(v[colnames(v)[i] ], class)=='integer'){
intPlot(v[,i],colnames(v)[i],target)
}
}
}
#print plots for all variables that are numeric.
plots(train, train$SalePrice)
I initially used a stepwise procedure, with a linear regression model for the lower formula and a binomial regression model for the upper formula; the results were much better than the linear formula alone.
Now I am using the random forest method to predict SalePrice and obtain better results; this algorithm is computationally heavier, but its predictions are considerably more effective.
#select only numeric values
normalize <- function(data){
subset <- select_if(data, is.numeric)
subset[is.na(subset)] <- 0
subset <- subset[complete.cases(subset),]
return(subset)
}
trainMod <- normalize(train)
testMod <- normalize(test)
null <- lm(SalePrice~1, trainMod)
all <- glm(as.factor(SalePrice) ~ LotArea+GrLivArea, data=trainMod, family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
stepResults <- step(null, scope = list(lower = null, upper = all), direction = "both",trace = 0)
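To see which predictors the stepwise search retained before moving on to the random forest, a quick sketch:
# Sketch: inspect the model selected by the stepwise procedure.
formula(stepResults)
summary(stepResults)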
rfFit <-train(SalePrice ~.,
data=trainMod,
method="rf",
trControl=trainControl(method="oob",number=5),
prox=TRUE, importance = TRUE,
allowParallel=TRUE)
# show the model summary
rfFit
## Random Forest
##
## 1460 samples
## 37 predictor
##
## No pre-processing
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared
## 2 31976.41 0.8378746
## 19 29207.33 0.8647381
## 37 29618.86 0.8608996
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 19.
# display the variables determined to be the most relevant
dotPlot(varImp(rfFit), main = "Random Forest Model - Most Relevant Variables")
# predict
#append scored data
result <- data.frame('Id' = testMod$Id,'SalePrice' = predict(rfFit, testMod))
result$SalePrice[result$SalePrice<0] <- 0
plot(density(trainMod$SalePrice))
plot(density(na.omit(result$SalePrice)))
write.csv(result, file = "results_for_kaggle.csv", row.names=FALSE)
When using ‘cv’ as the trainControl method:
> Username: Delagroove
> Score: 0.14829
When using ‘oob’ as the trainControl method:
> Username: Delagroove
> Score: 0.14813
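For reference, the ‘cv’ configuration mentioned above would be set up along these lines (a sketch; the fold count is an assumption) and passed to train() via trControl in place of the out-of-bag setting:
# Sketch: 5-fold cross-validation resampling as an alternative to "oob".
cv_control <- trainControl(method = "cv", number = 5)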