suppressWarnings(require(dplyr))
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
suppressWarnings(require(ggplot2))
## Loading required package: ggplot2
suppressWarnings(require(moments))
## Loading required package: moments
suppressWarnings(require(psych))
## Loading required package: psych
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
suppressWarnings(require(MASS))
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
url <- "https://raw.githubusercontent.com/andrewlulyj/Data-605/master/train.csv"
data <- read.csv(url, header = TRUE)
summary(data$TotalBsmtSF)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 795.8 991.5 1057.4 1298.2 6110.0
skewness(data$TotalBsmtSF)
## [1] 1.522688
ggplot(data = data, aes(x = TotalBsmtSF)) + geom_density(alpha = .2, fill = "#003333")
According to the density plot and the skewness statistic (1.52), TotalBsmtSF is right-skewed, so it will serve as the X variable. Since this is a house-price analysis, SalePrice will serve as Y.
Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.
x_Q1 <- quantile(data$TotalBsmtSF, 0.25)
y_Q1 <- quantile(data$SalePrice, 0.25)
# a. P(X > x | Y > y): among houses priced above the 1st quartile of SalePrice,
#    the proportion whose basement area is also above its 1st quartile
numerator <- nrow(filter(data, SalePrice > y_Q1 & TotalBsmtSF > x_Q1)) / nrow(data)
denominator <- nrow(filter(data, SalePrice > y_Q1)) / nrow(data)
p1 <- numerator / denominator
p1
## [1] 0.830137
# b. P(X > x, Y > y): the joint probability that a house is above the 1st
#    quartile on both TotalBsmtSF and SalePrice
p2 <- nrow(filter(data, SalePrice > y_Q1 & TotalBsmtSF > x_Q1)) / nrow(data)
p2
## [1] 0.6226027
# c. P(X < x | Y > y): among houses priced above the 1st quartile of SalePrice,
#    the proportion whose basement area is below its 1st quartile
numerator <- nrow(filter(data, SalePrice > y_Q1 & TotalBsmtSF < x_Q1)) / nrow(data)
denominator <- nrow(filter(data, SalePrice > y_Q1)) / nrow(data)
p3 <- numerator / denominator
p3
## [1] 0.169863
a <- nrow(filter(data, SalePrice <= y_Q1 & TotalBsmtSF <= x_Q1))
b<- nrow(filter(data, SalePrice > y_Q1 & TotalBsmtSF <= x_Q1))
c <- nrow(filter(data, SalePrice <= y_Q1 & TotalBsmtSF > x_Q1))
d<- nrow(filter(data, SalePrice > y_Q1 & TotalBsmtSF > x_Q1))
table <- matrix(c(a, b, a + b, c, d, c + d, a + c, b + d, a + b + c + d),
                ncol = 3, nrow = 3, byrow = TRUE)
colnames(table) <- c("<=1st quartile", ">1st quartile", "Total")
rownames(table) <- c("<=1st quartile", ">1st quartile", "Total")
table.table <- as.table(table)
table.table
##                <=1st quartile >1st quartile Total
## <=1st quartile            179           186   365
## >1st quartile             186           909  1095
## Total                     365          1095  1460
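As a cross-check, the same table of counts can be built directly from the quartile indicators in base R (a minimal sketch):
# tabulate the quartile indicators and add row/column totals
addmargins(table(TotalBsmtSF_above_Q1 = data$TotalBsmtSF > x_Q1,
                 SalePrice_above_Q1 = data$SalePrice > y_Q1))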
Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 1st quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does P(AB)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.
B<- nrow(filter(data, SalePrice > y_Q1))
A<- nrow(filter(data, TotalBsmtSF > x_Q1))
PA <- A/nrow(data)
PB <-B/nrow(data)
PAB <-nrow(filter(data, SalePrice > y_Q1 & TotalBsmtSF > x_Q1))/nrow(data)
PAB
## [1] 0.6226027
PA*PB
## [1] 0.5625
test <- matrix(c(a, b, c, d), 2, 2, byrow = TRUE)
chisq.test(test, correct=TRUE)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: test
## X-squared = 144.1, df = 1, p-value < 2.2e-16
P(AB) = 0.6226 while P(A)P(B) = 0.5625, so P(AB) != P(A)P(B) and X and Y are not independent. The chi-square test agrees: the p-value is well below 0.05, so we reject the null hypothesis that X and Y are independent.
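Equivalently, the chi-square test can be run on a table built directly from the quartile indicators instead of a hand-typed matrix (a sketch using the same logical splits as above):
# association test on the 2x2 table of quartile indicators
chisq.test(table(data$TotalBsmtSF > x_Q1, data$SalePrice > y_Q1), correct = TRUE)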
Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
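Univariate descriptive statistics and simple plots for the three quantitative variables used below can be produced with psych::describe (the psych package is already loaded above); a minimal sketch:
# univariate descriptive statistics for the three variables analyzed below
describe(data[, c("SalePrice", "TotalBsmtSF", "LotArea")])
# simple univariate plots
par(mfrow = c(1, 3))
hist(data$SalePrice, main = "SalePrice", xlab = "SalePrice")
hist(data$TotalBsmtSF, main = "TotalBsmtSF", xlab = "TotalBsmtSF")
hist(data$LotArea, main = "LotArea", xlab = "LotArea")
par(mfrow = c(1, 1))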
plot(data$SalePrice~ data$TotalBsmtSF)
cor<-cor(data[, which(names(data) %in% c("TotalBsmtSF", "SalePrice","LotArea"))])
cor
## LotArea TotalBsmtSF SalePrice
## LotArea 1.0000000 0.2608331 0.2638434
## TotalBsmtSF 0.2608331 1.0000000 0.6135806
## SalePrice 0.2638434 0.6135806 1.0000000
t1<-cor.test(data$SalePrice,data$TotalBsmtSF, method = "pearson" ,conf.level=0.92)
t2<-cor.test(data$SalePrice,data$LotArea, method = "pearson" ,conf.level=0.92)
t3<-cor.test(data$TotalBsmtSF,data$LotArea, method = "pearson" ,conf.level=0.92)
t1
##
## Pearson's product-moment correlation
##
## data: data$SalePrice and data$TotalBsmtSF
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## 0.5841762 0.6413763
## sample estimates:
## cor
## 0.6135806
t2
##
## Pearson's product-moment correlation
##
## data: data$SalePrice and data$LotArea
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## 0.2206794 0.3059759
## sample estimates:
## cor
## 0.2638434
t3
##
## Pearson's product-moment correlation
##
## data: data$TotalBsmtSF and data$LotArea
## t = 10.317, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## 0.2176019 0.3030429
## sample estimates:
## cor
## 0.2608331
All three tests have p-values below 0.05 (in fact < 2.2e-16), so we reject the null hypothesis of zero correlation for every pair: each pair of variables is linearly associated. With only three comparisons and p-values this small, familywise error is not a serious concern here; a Bonferroni adjustment, sketched below, leaves all three results significant.
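A quick check on familywise error, adjusting the three p-values with Bonferroni's method:
# Bonferroni-adjusted p-values for the three pairwise correlation tests
p.adjust(c(t1$p.value, t2$p.value, t3$p.value), method = "bonferroni")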
Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
precmatrix <- solve(cor)
precmatrix
## LotArea TotalBsmtSF SalePrice
## LotArea 1.0932718 -0.1734874 -0.1820040
## TotalBsmtSF -0.1734874 1.6313307 -0.9551793
## SalePrice -0.1820040 -0.9551793 1.6341000
cor %*% precmatrix
## LotArea TotalBsmtSF SalePrice
## LotArea 1.000000e+00 0 0.000000e+00
## TotalBsmtSF 0.000000e+00 1 2.220446e-16
## SalePrice -2.775558e-17 0 1.000000e+00
precmatrix %*% cor
## LotArea TotalBsmtSF SalePrice
## LotArea 1.000000e+00 -1.387779e-17 -5.551115e-17
## TotalBsmtSF 5.551115e-17 1.000000e+00 2.220446e-16
## SalePrice -5.551115e-17 0.000000e+00 1.000000e+00
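Both products are the identity matrix up to floating-point error. The prompt also asks for an LU decomposition of the matrix; a minimal base-R sketch of the Doolittle factorization is given below (no pivoting, which is safe here because the correlation matrix is positive definite).
# Doolittle LU decomposition without pivoting: A = L %*% U
lu_decomp <- function(A) {
  n <- nrow(A)
  L <- diag(n)
  U <- matrix(0, n, n)
  for (i in seq_len(n)) {
    for (j in i:n)
      U[i, j] <- A[i, j] - sum(L[i, seq_len(i - 1)] * U[seq_len(i - 1), j])
    if (i < n)
      for (j in (i + 1):n)
        L[j, i] <- (A[j, i] - sum(L[j, seq_len(i - 1)] * U[seq_len(i - 1), i])) / U[i, i]
  }
  list(L = L, U = U)
}
lu_cor <- lu_decomp(cor)
lu_cor$L %*% lu_cor$U  # should reproduce the correlation matrix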
suppressWarnings(suppressMessages(library(FactoMineR)))
PCA(data[, which(names(data) %in% c("TotalBsmtSF", "SalePrice","LotArea"))], scale.unit=TRUE, ncp=5, graph=T)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 1460 individuals, described by 3 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
Many times, it makes sense to fit a closed form distribution to data. For the first variable that you selected which is skewed to the right, shift it so that the minimum value is above zero as necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
suppressWarnings(suppressMessages(library(MASS)))
suppressWarnings(suppressMessages(library(Rmisc)))
# shift the variable so its minimum value (0) is strictly above zero before fitting
TotalBsmtSF <- data$TotalBsmtSF + 0.0000001
model <- fitdistr(TotalBsmtSF, "exponential")
(lambda <- model$estimate)
## rate
## 0.0009456896
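As a sanity check, the maximum-likelihood estimate of the exponential rate is simply the reciprocal of the sample mean, which matches the fitted value above:
# MLE of the exponential rate is 1/mean; compare with model$estimate
1 / mean(TotalBsmtSF)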
sample <- rexp(1000, lambda)
hist(sample)
hist(TotalBsmtSF)
# exponential 5th and 95th percentiles via the inverse CDF: q = -log(1 - p) / lambda
cdf_5 <- log(1 - .05)/-lambda
cdf_95 <- log(1 - .95)/-lambda
# empirical 5th and 95th percentiles of the original variable
obs_5 <- quantile(data$TotalBsmtSF, 0.05)
obs_95 <- quantile(data$TotalBsmtSF, 0.95)
CI(data$TotalBsmtSF, 0.95)
## upper mean lower
## 1079.951 1057.429 1034.908
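The theoretical and empirical percentiles computed above are never printed; the sketch below displays them side by side and confirms that qexp() reproduces the closed-form inverse-CDF values:
# qexp() gives the same 5th/95th percentiles as the closed-form expressions above
qexp(c(0.05, 0.95), rate = lambda)
# exponential vs. empirical percentiles, side by side
rbind(exponential = unname(c(cdf_5, cdf_95)),
      empirical = unname(c(obs_5, obs_95)))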
Based on the two histograms, the fitted exponential reproduces the long right tail of TotalBsmtSF reasonably well, so the original variable is fairly close to an exponential shape.
Build some type of regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
suppressWarnings(suppressMessages(library(caret)))
suppressWarnings(suppressMessages(library(randomForest)))
# create the training dataset, limited to numeric variables, with NAs replaced by 0
numeric_var <- names(data)[which(sapply(data, is.numeric))]
house.train <- data[numeric_var]
house.train[is.na(house.train)] <- 0
# create the test dataset, limited to numeric variables
# (assumes `test` has been read from the competition's test.csv, e.g.
#  test <- read.csv("test.csv", header = TRUE))
numeric_var <- names(test)[which(sapply(test, is.numeric))]
house.test <- test[numeric_var]
house.test[is.na(house.test)] <- 0
model <-train(SalePrice ~.,
data=house.train,
method="rf",
trControl=trainControl(method="cv",number=5),
prox=TRUE, importance = TRUE,
allowParallel=TRUE)
# show the model summary
model
## Random Forest
##
## 1460 samples
## 37 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1168, 1166, 1169, 1169, 1168
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 33102.51 0.8459101 18895.12
## 19 31229.53 0.8494069 17639.19
## 37 32507.35 0.8366584 18523.13
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 19.
dotPlot(varImp(model), main = "Random Forest Model")
# in-sample predictions on the training data (the Kaggle submission itself
# requires predictions on the test set; see the sketch below)
pred_rf <- predict(model, house.train)
submission <- as.data.frame(cbind(data$Id, pred_rf))
colnames(submission) <- c("Id", "SalePrice")
dim(submission)
## [1] 1460 2
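The predictions above are in-sample, on the training data. For the actual Kaggle submission, predictions should come from the test set; a minimal sketch, assuming `test` holds the competition's test.csv as referenced earlier:
# predict on the held-out Kaggle test set and write the submission file
pred_test <- predict(model, house.test)
submission <- data.frame(Id = test$Id, SalePrice = pred_test)
write.csv(submission, "submission.csv", row.names = FALSE)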