Link to RPubs document

Required Libraries

suppressWarnings(require(ggplot2))
suppressWarnings(require(moments))
suppressWarnings(require(psych))
suppressWarnings(require(MASS))

Probability

Calculate, at a minimum, probabilities a through d below. Assume the lowercase "x" is estimated as the 3rd quartile of the X variable (GrLivArea), and the lowercase "y" is estimated as the 2nd quartile of the Y variable (LotArea). Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.

  a. P(X>x, Y>y)

  b. P(X>x, Y<y)

  c. P(X<x, Y>y)

  d. P(X<x, Y<y)

#Assign quartile values to variables.
XQ3<-quantile(data$GrLivArea, 0.75)
YQ2<-quantile(data$LotArea, 0.50)

#Create subsets of the data based on the quartile cutoffs.
yY <- subset(data,LotArea <= YQ2)
Yy <- subset(data,LotArea > YQ2)
XxYy<- subset(Yy, GrLivArea > XQ3)
XxyY<- subset(yY, GrLivArea > XQ3)
xXYy<- subset(Yy, GrLivArea <= XQ3)
xXyY<- subset(yY, GrLivArea <= XQ3)
#P(X>x, Y>y)
a<-nrow(XxYy)
nrow(XxYy)/nrow(data)
## [1] 0.1993151
#P(X>x, Y<y)
b<-nrow(XxyY)
nrow(XxyY)/nrow(data)
## [1] 0.05068493
#P(X<x, Y>y)
c<-nrow(xXYy) #naming a variable 'c' is legal but risky; R still resolves c(...) calls to the function
nrow(xXYy)/nrow(data)
## [1] 0.3006849
#P(X<x, Y<y)
d<-nrow(xXyY)
nrow(xXyY)/nrow(data)
## [1] 0.4493151
#Count and summarize the count data into a table. 
values <- matrix(c(d,c,(d+c),b,a,(b+a),(b+d),(c+a),(a+b+c+d)),ncol=3, nrow=3,byrow=TRUE)
colnames(values) <- c("<=2nd quartile",">2nd quartile","Total")
rownames(values) <- c('<=3rd quartile', '>3rd quartile','Total')
values.table <- as.table(values)
values.table
##                <=2nd quartile >2nd quartile Total
## <=3rd quartile            656           439  1095
## >3rd quartile              74           291   365
## Total                     730           730  1460
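As a cross-check, the same counts can be produced directly with table() and addmargins(); a minimal sketch (the logical TRUE/FALSE labels differ from the hand-built table above, and no output is shown since this was not part of the original run):

#Quadrant counts with margins via logical splits
addmargins(table(GrLivArea_gt_Q3 = data$GrLivArea > XQ3,
                 LotArea_gt_Q2 = data$LotArea > YQ2))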

Does splitting the training data in this fashion make them independent?

No. Splitting the data at the quartiles simply partitions the observations into quadrants; it does not make X and Y independent. If the variables were independent, each quadrant count would be close to the product of its marginal proportions, but 291 of the 365 observations above the 3rd quartile of GrLivArea also lie above the 2nd quartile of LotArea (about 80%, versus the 50% expected under independence).

Let A be the new variable counting those observations above the 3rd quartile for X, and let B be the new variable counting those observations for the 2nd quartile for Y. Does P(A|B)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi-Square test for association.

#Observations above the 3rd quartile for X
Xx_A <- subset(data, GrLivArea >= XQ3)

#Observations in the 2nd quartile for Y (between the 2nd and 3rd quartiles of LotArea)
YQ3<-quantile(data$LotArea, 0.75)
Yy_B_1 <- subset(data, LotArea <= YQ3)
YY_B_2 <- subset(Yy_B_1, LotArea >= YQ2)

Does P(A|B)=P(A)P(B)?

No. The code below computes the joint probability P(A and B) ≈ 0.0788, while P(A)P(B) = 0.0625. Since the two differ, A and B are not independent. (P(A|B) itself is P(A and B)/P(B) ≈ 0.0788/0.25 ≈ 0.315, which likewise differs from P(A) ≈ 0.25.)

#P(A and B): joint probability, computed against the full sample
YY_XX <- subset(YY_B_2, GrLivArea >= XQ3)
nrow(YY_XX)/nrow(data)
## [1] 0.07876712
#P(A)P(B)
(nrow(Xx_A)/nrow(data))*nrow(YY_B_2)/nrow(data)
## [1] 0.0625
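For completeness, the conditional probability itself can be computed from the subsets defined above; a minimal sketch (no output shown, as this was not part of the original run):

#P(A|B) = P(A and B) / P(B)
nrow(YY_XX)/nrow(YY_B_2)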

Chi-Square test for association.

chi_table<- table(data$GrLivArea, data$LotArea)
suppressWarnings(chisq.test(chi_table))
## 
##  Pearson's Chi-squared test
## 
## data:  chi_table
## X-squared = 944670, df = 921920, p-value < 2.2e-16

We receive a p-value < 2.2e-16, so we reject the null hypothesis of independence: GrLivArea and LotArea are associated, not independent. Note, however, that tabulating the raw continuous values produces an extremely sparse table (df = 921,920), so the chi-squared approximation is unreliable here.
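A more reliable version of the test uses the 2x2 quartile counts from the table above; a minimal sketch (the matrix layout is an assumption based on that table, and no output is shown since this was not part of the original run):

#Chi-Square test on the quadrant counts (rows: X vs 3rd quartile, columns: Y vs 2nd quartile)
quad_counts <- matrix(c(656, 439, 74, 291), nrow=2, byrow=TRUE,
                      dimnames=list(c("<=Q3",">Q3"), c("<=Q2",">Q2")))
chisq.test(quad_counts)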

Descriptive and Inferential Statistics

Variable Density Plots

#describe(data)

#Collect summary statistics
GrLivArea.mean <-mean(data$GrLivArea)
GrLivArea.median <-median(data$GrLivArea)
GrLivArea.mode <- as.numeric(names(sort(-table(data$GrLivArea))))[1] #most frequent value
GrLivArea.sd <- sd(data$GrLivArea)
LotArea.mean <-mean(data$LotArea)
LotArea.median <-median(data$LotArea)
LotArea.mode <- as.numeric(names(sort(-table(data$LotArea))))[1] #most frequent value
LotArea.sd <- sd(data$LotArea)

#Create density plot for GrLivArea variable
d_GrLivArea <- density(data$GrLivArea)
plot(d_GrLivArea, main="GrLivArea Density", ylab="Density", xlab="GrLivArea")
polygon(d_GrLivArea, col="red")
abline(v = GrLivArea.median, col = "green", lwd = 2)
abline(v = GrLivArea.mean, col = "blue", lwd = 2)
abline(v = GrLivArea.mode, col = "purple", lwd = 2)
legend("topright", legend=c("median", "mean","mode"),col=c("green", "blue", "purple"), lty=1, cex=0.8)

#Create density plot for LotArea variable.
LotArea_for_graph <- density(data$LotArea, na.rm=TRUE)
plot(LotArea_for_graph, main="LotArea Density", ylab="Density", xlab="LotArea")
polygon(LotArea_for_graph, col="red")
abline(v = LotArea.median, col = "green", lwd = 2)
abline(v = LotArea.mean, col = "blue", lwd = 2)
abline(v = LotArea.mode, col = "purple", lwd = 2)
legend("topright", legend=c("median", "mean","mode"),col=c("green", "blue", "purple"), lty=1, cex=0.8, )

Scatterplot

plot(data$GrLivArea,data$LotArea, main="GrLivArea vs LotArea Scatterplot", 
     xlab="GrLivArea", ylab="LotArea", pch=19)

Provide a 95% Confidence Interval for the difference in the mean of the variables.

t.test(data$GrLivArea,data$LotArea)
## 
##  Welch Two Sample t-test
## 
## data:  data$GrLivArea and data$LotArea
## t = -34.411, df = 1467.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -9514.482 -8488.247
## sample estimates:
## mean of x mean of y 
##  1515.464 10516.828

We receive a 95% confidence interval of (-9514.48, -8488.25) for the difference in means; since the interval excludes zero, the mean of GrLivArea differs significantly from the mean of LotArea.

Derive a correlation matrix for two of the quantitative variables you selected.

corMatrix<-cor(data[, which(names(data) %in% c("LotArea", "GrLivArea"))])
corMatrix
##             LotArea GrLivArea
## LotArea   1.0000000 0.2631162
## GrLivArea 0.2631162 1.0000000

These results show a weak positive correlation between the variables (r ≈ 0.26, below the conventional 0.3 threshold for even a moderate relationship).

Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval.

t.test(data$GrLivArea,data$LotArea, conf.level=0.99)
## 
##  Welch Two Sample t-test
## 
## data:  data$GrLivArea and data$LotArea
## t = -34.411, df = 1467.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
##  -9676.036 -8326.692
## sample estimates:
## mean of x mean of y 
##  1515.464 10516.828

This is again a Welch two-sample t-test on the difference in means rather than a test of the correlation. At the 99% level the confidence interval widens to (-9676.04, -8326.69), as expected with the higher confidence requirement.
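A direct test of the hypothesis that the correlation is zero, with a confidence interval for the correlation coefficient itself, would use cor.test; a minimal sketch (no output shown, as this was not part of the original run):

#Test H0: correlation = 0, with a 99% CI for the correlation coefficient
cor.test(data$GrLivArea, data$LotArea, conf.level = 0.99)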

Linear Algebra and Correlation

Invert your correlation matrix (Precision matrix).

corMatrixInverse <- ginv(corMatrix)
corMatrixInverse
##            [,1]       [,2]
## [1,]  1.0743794 -0.2826866
## [2,] -0.2826866  1.0743794

The diagonal elements of the precision matrix are the variance inflation factors (VIFs): each measures how much the variance of one variable is inflated by its linear relationship with the other.
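Since this 2x2 correlation matrix is full rank, base R's solve() returns the same inverse without MASS's generalized inverse; a minimal equivalent:

#Exact inverse via base R (matches ginv() here because corMatrix is invertible)
solve(corMatrix)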

Multiply the precision matrix by the correlation matrix.

The product is the identity matrix (up to floating-point rounding), as expected when a matrix is multiplied by its inverse.

matrix1<- corMatrixInverse %*% corMatrix
matrix1
##           LotArea    GrLivArea
## [1,] 1.000000e+00 1.110223e-16
## [2,] 3.330669e-16 1.000000e+00

Multiply the correlation matrix by the precision matrix.

Reversing the order of multiplication also yields the identity matrix, within floating-point error: a matrix commutes with its inverse, so both products are I.

matrix2<- corMatrix %*% corMatrixInverse
matrix2
##                   [,1]         [,2]
## LotArea   1.000000e+00 2.220446e-16
## GrLivArea 2.220446e-16 1.000000e+00

Principal Components Analysis

#Perform a log transform on each variable to normalize
dataCopy<-data
dataCopy$GrLivArea<-log(dataCopy$GrLivArea)
dataCopy$LotArea<-log(dataCopy$LotArea)
dataCopy$SalePrice<-log(dataCopy$SalePrice)

#Apply PCA to the log-transformed variables, with centering and scaling

data.pca<-prcomp(dataCopy[, which(names(dataCopy) %in% c("LotArea", "GrLivArea","SalePrice"))],center = TRUE,scale. = TRUE)
data.pca
## Standard deviations:
## [1] 1.4246936 0.8370813 0.5191755
## 
## Rotation:
##                 PC1        PC2         PC3
## LotArea   0.4746802  0.8799375  0.01971454
## GrLivArea 0.6204099 -0.3503987  0.70164972
## SalePrice 0.6243159 -0.3208281 -0.71224926
summary(data.pca)
## Importance of components:
##                           PC1    PC2     PC3
## Standard deviation     1.4247 0.8371 0.51918
## Proportion of Variance 0.6766 0.2336 0.08985
## Cumulative Proportion  0.6766 0.9102 1.00000
biplot(data.pca)

theta <- seq(0,2*pi,length.out = 100)
circle <- data.frame(x = cos(theta), y = sin(theta))
p <- ggplot(circle,aes(x,y)) + geom_path()

loadings <- data.frame(data.pca$rotation, 
                       .names = row.names(data.pca$rotation))
p + geom_text(data=loadings, 
              mapping=aes(x = PC1, y = PC2, label = .names, colour = .names)) +
  coord_fixed(ratio=1) +
  labs(x = "PC1", y = "PC2")

screeplot(data.pca, type="lines")

In this interpretation, SalePrice and GrLivArea load almost equally on PC1 (0.624 and 0.620), indicating a strong association between the two; PC1 alone accounts for about 68% of the total variance.
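The proportions of variance in the summary come directly from the component variances; a minimal cross-check:

#Proportion of variance explained by each PC (matches the summary above)
data.pca$sdev^2 / sum(data.pca$sdev^2)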

Calculus-Based Probability & Statistics

We take the right-skewed GrLivArea data and fit an exponential distribution to it.

#Using the MASS library
#Calculate the optimal lambda for right-skewed GrLivArea
lambda<-fitdistr(data$GrLivArea,"exponential")
lambda$estimate
##        rate 
## 0.000659864
#Draw 1000 random samples from an exponential distribution with the fitted rate
pdf_distr<-rexp(1000,lambda$estimate)

#Plot the results of the exponential distribution
hist(pdf_distr, freq = FALSE, breaks = 100, main ="Fitted Exponential PDF with GrLivArea data", xlim = c(1, quantile(pdf_distr, 0.99)))
curve(dexp(x, rate = lambda$estimate), col = "red", add = TRUE)

#Plot the results as compared to the original data
hist(data$GrLivArea, freq = FALSE, breaks = 100, main ="Comparison with original GrLivArea data",xlim = c(1, quantile(data$GrLivArea, 0.99)))
curve(dexp(x, rate = lambda$estimate), col = "red", add = TRUE)

With the exponential PDF:

5th and 95th percentiles of the fitted exponential distribution, obtained via the quantile function (the inverse of the CDF)

qexp(0.05, rate = lambda$estimate, lower.tail = TRUE, log.p = FALSE)
## [1] 77.73313
qexp(0.95, rate = lambda$estimate, lower.tail = TRUE, log.p = FALSE)
## [1] 4539.924

95% confidence interval from the empirical data, assuming normality

qnorm(0.95,GrLivArea.mean, GrLivArea.sd)
## [1] 2379.802
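Note that qnorm(0.95, mean, sd) returns the one-sided 95th percentile of the fitted normal rather than a two-sided interval. Under one reading of the prompt (an assumption), a two-sided 95% normal interval for the data values would be:

#Two-sided 95% interval under a normal assumption for the data
qnorm(c(0.025, 0.975), GrLivArea.mean, GrLivArea.sd)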

Empirical 5th percentile and 95th percentile of the data

quantile(data$GrLivArea, c(.05, .95))
##     5%    95% 
##  848.0 2466.1

Discuss

These exercises highlight the differences between an exponential distribution and our right-skewed data. The exponential density roughly approximates the empirical density only to the right of the mean, where both decay; elsewhere it fails badly, as the quantile comparison shows (fitted 5th percentile 77.7 versus empirical 848.0, and fitted 95th percentile 4539.9 versus empirical 2466.1). The exponential fit is therefore a noticeably incomplete representation of GrLivArea.

Part 2: Kaggle Competition

Competition Details

Sold! How do home features add up to its price tag?

With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.

Data Loading

source_github <- function(u) {
  #Helper to source a script from a URL with SSL verification disabled
  #(not used below; source() reads the URL directly)
  suppressWarnings(require(RCurl))
  script <- getURL(u, ssl.verifypeer = FALSE)
  eval(parse(text = script))
}
suppressWarnings(source("https://raw.githubusercontent.com/RobertSellers/r_Scripts/master/regression_suggestions.R"))

suppressWarnings(require(dplyr)) #provides %>% and select() used below
train <- read.csv("https://raw.githubusercontent.com/RobertSellers/605_MATH/master/final/train.csv", header = TRUE)
train<-train%>%select(-Id)

test <-read.csv("https://raw.githubusercontent.com/RobertSellers/605_MATH/master/final/test.csv", header = TRUE)

Data Exploration

Predictor Variable analysis

The following script is disabled; when run, it outputs a .txt file with predictor-variable interpretations that are useful when determining transformations.

Example variable output from suggest function.

Numeric Predictor Analysis

Only numeric variables are shown, to save document space.

predictorPlots(train,train$SalePrice)

Model Selection

The test data received the same variable transformations as the original training data set. A stepwise (both-direction) selection procedure was used, as shown below.

#Keep numeric columns and zero-fill missing values.
#(Note: this local reshape() masks stats::reshape for the session.)
reshape<-function(data){
  subset.int<-select_if(data, is.numeric)
  subset.int[is.na(subset.int)] <- 0 #zero-fill NAs
  subset.int<-subset.int[complete.cases(subset.int),] #a no-op after zero-filling; kept as a safeguard
  return(subset.int)
}
trainMod<-reshape(train)
testMod<-reshape(test)
all<-lm(SalePrice~.,trainMod)
null<-lm(SalePrice~1, trainMod)

stepResults<-step(null,scope=list(lower=null,upper=all),direction="both",trace=0)
#Score the test data with the selected model
result<-data.frame('Id'=testMod$Id,'SalePrice'=predict(stepResults, testMod))
result$SalePrice[result$SalePrice<0] <- 0 #clamp negative predictions to zero
par(mfrow=c(2,1))
plot(density(trainMod$SalePrice))
plot(density(na.omit(result$SalePrice)))

write.csv(result, file = "result.csv",row.names=FALSE)
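As a quick in-sample sanity check of the selected model (an addition using the stepResults object above; no output shown since this was not part of the original run):

#Adjusted R-squared and in-sample RMSE of the stepwise-selected model
summary(stepResults)$adj.r.squared
sqrt(mean(residuals(stepResults)^2))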