suppressWarnings(require(dplyr))
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
suppressWarnings(require(ggplot2))
## Loading required package: ggplot2
suppressWarnings(require(moments))
## Loading required package: moments
suppressWarnings(require(psych))
## Loading required package: psych
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
suppressWarnings(require(MASS))
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select

Pick X and Y Variables

url <- "https://raw.githubusercontent.com/andrewlulyj/Data-605/master/train.csv"
data <- read.csv(url, header = TRUE)
summary(data$TotalBsmtSF)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   795.8   991.5  1057.4  1298.2  6110.0
skewness(data$TotalBsmtSF)
## [1] 1.522688
ggplot(data = data, aes(x = TotalBsmtSF)) + geom_density(alpha = .2, fill = "#003333")

According to the plot and the skewness result, TotalBsmtSF is right-skewed, so it will be X. Since this is house-price research, I will pick SalePrice as Y.

Calculate Probability

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.

a. P(X > x | Y > y)

x_Q1 <- quantile(data$TotalBsmtSF, 0.25)
y_Q1 <- quantile(data$SalePrice, 0.25)
numerator <- nrow(filter(data, SalePrice > y_Q1 & TotalBsmtSF > x_Q1)) /nrow(data)
denominator <- nrow(filter(data, SalePrice > y_Q1 ))/nrow(data)
p1 <- numerator / denominator
p1
## [1] 0.830137

Given that a house sells above the first-quartile price, there is about an 83% chance that its total basement area is also above its first quartile.

b. P(X > x, Y > y)

p2 <- nrow(filter(data, SalePrice > y_Q1 & TotalBsmtSF > x_Q1)) / nrow(data)
p2
## [1] 0.6226027

About 62% of all houses are above the first quartile on both basement area and sale price.

c. P(X < x | Y > y)

numerator <- nrow(filter(data, SalePrice > y_Q1 & TotalBsmtSF < x_Q1)) / nrow(data)
denominator <- nrow(filter(data, SalePrice > y_Q1)) / nrow(data)
p3 <- numerator / denominator
p3
## [1] 0.169863

Given a sale price above the first quartile, there is about a 17% chance the basement area falls below its first quartile; this is the complement of (a).
# cell counts for the table: rows split X (TotalBsmtSF), columns split Y (SalePrice)
a <- nrow(filter(data, SalePrice <= y_Q1 & TotalBsmtSF <= x_Q1))
b <- nrow(filter(data, SalePrice > y_Q1 & TotalBsmtSF <= x_Q1))
c <- nrow(filter(data, SalePrice <= y_Q1 & TotalBsmtSF > x_Q1))
d <- nrow(filter(data, SalePrice > y_Q1 & TotalBsmtSF > x_Q1))

counts <- matrix(c(a, b, a + b, c, d, c + d, a + c, b + d, a + b + c + d),
                 ncol = 3, nrow = 3, byrow = TRUE)
colnames(counts) <- c("Y <= 1st quartile", "Y > 1st quartile", "Total")
rownames(counts) <- c("X <= 1st quartile", "X > 1st quartile", "Total")
as.table(counts)
##                   Y <= 1st quartile Y > 1st quartile Total
## X <= 1st quartile               179              186   365
## X > 1st quartile                186              909  1095
## Total                           365             1095  1460

Independence

Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 1st quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does P(AB)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.

B <- nrow(filter(data, SalePrice > y_Q1))    # observations above Y's 1st quartile
A <- nrow(filter(data, TotalBsmtSF > x_Q1))  # observations above X's 1st quartile
PA <- A / nrow(data)
PB <- B / nrow(data)
PAB <- nrow(filter(data, SalePrice > y_Q1 & TotalBsmtSF > x_Q1)) / nrow(data)
PAB
## [1] 0.6226027
PA*PB
## [1] 0.5625
# chi-square test of association on the 2x2 cell counts computed above
test <- matrix(c(a, b, c, d), 2, 2, byrow = TRUE)

chisq.test(test, correct = TRUE)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  test
## X-squared = 148.31, df = 1, p-value < 2.2e-16

P(AB) = 0.623 while P(A)P(B) = 0.563, so P(AB) ≠ P(A)P(B) and splitting the data this way does not make X and Y independent. The chi-square test agrees: the p-value is far below 0.05, so we reject the null hypothesis that X and Y are independent.
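Equivalently, if A and B were independent, conditioning on B would leave the probability of A unchanged; a quick check with the quantities already computed:

PAB / PB  # P(A|B)
## [1] 0.830137
PA        # unconditional P(A); conditioning clearly shifts the probability
## [1] 0.75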

Descriptive and Inferential Statistics.

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
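The univariate descriptive statistics asked for above can come from psych::describe, which is already loaded; a minimal sketch for the three quantitative variables used below:

# univariate descriptives: n, mean, sd, median, skew, kurtosis, etc.
describe(data[, c("LotArea", "TotalBsmtSF", "SalePrice")])

# univariate plots to accompany the summaries
hist(data$SalePrice, main = "SalePrice", xlab = "Sale price (USD)")
hist(data$TotalBsmtSF, main = "TotalBsmtSF", xlab = "Total basement area (sq ft)")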

plot(SalePrice ~ TotalBsmtSF, data = data)  # scatterplot of Y against X

cor_mat <- cor(data[, c("LotArea", "TotalBsmtSF", "SalePrice")])
cor_mat
##               LotArea TotalBsmtSF SalePrice
## LotArea     1.0000000   0.2608331 0.2638434
## TotalBsmtSF 0.2608331   1.0000000 0.6135806
## SalePrice   0.2638434   0.6135806 1.0000000
t1<-cor.test(data$SalePrice,data$TotalBsmtSF, method = "pearson" ,conf.level=0.92)
t2<-cor.test(data$SalePrice,data$LotArea, method = "pearson" ,conf.level=0.92)
t3<-cor.test(data$TotalBsmtSF,data$LotArea, method = "pearson" ,conf.level=0.92)
t1
## 
##  Pearson's product-moment correlation
## 
## data:  data$SalePrice and data$TotalBsmtSF
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.5841762 0.6413763
## sample estimates:
##       cor 
## 0.6135806
t2
## 
##  Pearson's product-moment correlation
## 
## data:  data$SalePrice and data$LotArea
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.2206794 0.3059759
## sample estimates:
##       cor 
## 0.2638434
t3
## 
##  Pearson's product-moment correlation
## 
## data:  data$TotalBsmtSF and data$LotArea
## t = 10.317, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.2176019 0.3030429
## sample estimates:
##       cor 
## 0.2608331

All three tests have p-values below 2.2e-16, so each pairwise correlation is significantly different from zero: LotArea, TotalBsmtSF, and SalePrice are all pairwise correlated. Familywise error, the chance of at least one false positive across the three tests, is bounded by 1 - (1 - 0.08)^3 ≈ 0.22 at this confidence level; with p-values this small, every test survives any standard multiple-comparison correction, so familywise error is not a practical concern here.
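For reference, the familywise bound and a Bonferroni adjustment of the three p-values can be computed directly from the test objects above (a quick sketch):

alpha <- 1 - 0.92   # per-test alpha implied by the 92% confidence intervals
1 - (1 - alpha)^3   # familywise error bound across the 3 tests
## [1] 0.221312

# Bonferroni-adjusted p-values; all remain effectively zero
p.adjust(c(t1$p.value, t2$p.value, t3$p.value), method = "bonferroni")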

Linear Algebra and Correlation.

Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Then conduct LU decomposition on the matrix.

precmatrix <- solve(cor_mat)  # precision matrix: the inverse of the correlation matrix
precmatrix
##                LotArea TotalBsmtSF  SalePrice
## LotArea      1.0932718  -0.1734874 -0.1820040
## TotalBsmtSF -0.1734874   1.6313307 -0.9551793
## SalePrice   -0.1820040  -0.9551793  1.6341000
cor_mat %*% precmatrix  # identity, up to floating-point error
##                   LotArea TotalBsmtSF    SalePrice
## LotArea      1.000000e+00           0 0.000000e+00
## TotalBsmtSF  0.000000e+00           1 2.220446e-16
## SalePrice   -2.775558e-17           0 1.000000e+00
precmatrix %*% cor_mat  # identity again; the inverse works on both sides
##                   LotArea   TotalBsmtSF     SalePrice
## LotArea      1.000000e+00 -1.387779e-17 -5.551115e-17
## TotalBsmtSF  5.551115e-17  1.000000e+00  2.220446e-16
## SalePrice   -5.551115e-17  0.000000e+00  1.000000e+00

Decomposition of the matrix
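The assignment also asks for an LU decomposition; since no LU routine is loaded in this session, here is a minimal hand-rolled Doolittle sketch applied to the precision matrix (no pivoting, which is safe here because the matrix is symmetric positive definite):

lu_decomp <- function(A) {
  n <- nrow(A)
  L <- diag(n)
  U <- matrix(0, n, n, dimnames = dimnames(A))
  for (i in 1:n) {
    for (j in i:n)
      # row i of U, net of what earlier pivots already explain
      U[i, j] <- A[i, j] - sum(L[i, seq_len(i - 1)] * U[seq_len(i - 1), j])
    for (j in seq_len(n)[-seq_len(i)])
      # column i of L below the diagonal
      L[j, i] <- (A[j, i] - sum(L[j, seq_len(i - 1)] * U[seq_len(i - 1), i])) / U[i, i]
  }
  list(L = L, U = U)
}
lu_parts <- lu_decomp(precmatrix)
lu_parts$L %*% lu_parts$U  # reproduces precmatrix up to floating-point error

A principal component analysis of the same three variables follows as a second way to decompose this data.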

suppressWarnings(suppressMessages(library(FactoMineR)))
PCA(data[, c("LotArea", "TotalBsmtSF", "SalePrice")], scale.unit = TRUE, ncp = 3, graph = TRUE)

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 1460 individuals, described by 3 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

Calculus-Based Probability & Statistics.

Many times, it makes sense to fit a closed form distribution to data. For the first variable that you selected which is skewed to the right, shift it so that the minimum value is above zero as necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html.) Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

suppressWarnings(suppressMessages(library(MASS)))
suppressWarnings(suppressMessages(library(Rmisc)))
# shift the variable so its minimum (0) is strictly positive before fitting
TotalBsmtSF <- data$TotalBsmtSF + 0.0000001
# fit an exponential pdf; the MLE of the rate lambda is 1/mean(x)
model <- fitdistr(TotalBsmtSF, "exponential")
(lambda <- model$estimate)
##         rate 
## 0.0009456896
sample <- rexp(1000, lambda)
hist(sample)

hist(TotalBsmtSF)

cdf_5 <- qexp(0.05, rate = lambda)    # exponential 5th percentile, about 54.2
cdf_95 <- qexp(0.95, rate = lambda)   # exponential 95th percentile, about 3167.8
obs_5 <- quantile(data$TotalBsmtSF, 0.05)   # empirical 5th percentile
obs_95 <- quantile(data$TotalBsmtSF, 0.95)  # empirical 95th percentile
CI(data$TotalBsmtSF, 0.95)
##    upper     mean    lower 
## 1079.951 1057.429 1034.908

Both histograms are strongly right-skewed, so an exponential is a plausible closed-form fit for TotalBsmtSF. It is not a perfect one: the exponential puts its mode at zero and its 5th and 95th percentiles at roughly 54 and 3168 square feet, while the data cluster around a median of about 991. The normal-theory 95% confidence interval (about 1035 to 1080) is an interval for the mean of TotalBsmtSF, not for individual houses, which is why it is so much narrower than the percentile ranges.
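For a more direct visual comparison (a quick sketch using the fitted rate), the exponential density can be overlaid on the empirical density:

# overlay the fitted exponential density on the empirical density
plot(density(TotalBsmtSF), main = "TotalBsmtSF: empirical vs. fitted exponential",
     xlab = "Total basement area (sq ft)")
curve(dexp(x, rate = lambda), add = TRUE, lty = 2)
legend("topright", legend = c("empirical density", "exponential fit"), lty = 1:2)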

Modeling

Build some type of regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

suppressWarnings(suppressMessages(library(caret)))
suppressWarnings(suppressMessages(library(randomForest)))
# keep only numeric predictors in the training data; zero-fill missing values
numeric_var <- names(data)[which(sapply(data, is.numeric))]
house.train <- data[numeric_var]
house.train[is.na(house.train)] <- 0

# create the test dataset, limited to numeric variables
# (assumes test.csv sits next to train.csv in the same GitHub repo)
test_url <- "https://raw.githubusercontent.com/andrewlulyj/Data-605/master/test.csv"
test <- read.csv(test_url, header = TRUE)
numeric_var <- names(test)[which(sapply(test, is.numeric))]
house.test <- test[numeric_var]
house.test[is.na(house.test)] <- 0

model <- train(SalePrice ~ .,
              data=house.train,
              method="rf",
              trControl=trainControl(method="cv",number=5),
              prox=TRUE, importance = TRUE,
              allowParallel=TRUE)

# show the model summary          
model
## Random Forest 
## 
## 1460 samples
##   37 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1168, 1166, 1169, 1169, 1168 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##    2    33102.51  0.8459101  18895.12
##   19    31229.53  0.8494069  17639.19
##   37    32507.35  0.8366584  18523.13
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 19.
dotPlot(varImp(model), main = "Random Forest Model")

# predict on the held-out Kaggle test set
pred_rf <- predict(model, house.test)

submission <- data.frame(Id = test$Id, SalePrice = pred_rf)

dim(submission)
## [1] 1459    2
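To actually submit, the data frame gets written to a csv for upload (the file name below is arbitrary). As a rough sanity check, the in-sample value of the competition metric, RMSE of log(SalePrice), can be computed as well, though it is optimistic because the forest has already seen the training data:

# write the submission file for upload to Kaggle
write.csv(submission, "submission.csv", row.names = FALSE)

# optimistic in-sample check of the Kaggle metric (RMSE on log prices)
train_pred <- predict(model, house.train)
sqrt(mean((log(train_pred) - log(house.train$SalePrice))^2))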