test_data <- read.csv("https://raw.githubusercontent.com/smithchad17/Class605/master/FinalExam/test.csv", header = T, stringsAsFactors = F)
train_data <- read.csv("https://raw.githubusercontent.com/smithchad17/Class605/master/FinalExam/train.csv", header = T, stringsAsFactors = F)
‘GrLivArea’ is the quantitative independent variable
#Store x and y variables in a data frame
x <- train_data$GrLivArea
y <- train_data$SalePrice
df_xy <- data.frame(x, y)
#Get the IQR data from the variables
x_iqr <- quantile(x)
y_iqr <- quantile(y)
#Store the 25% x-value and 50% y-value for analysis
#Since it's a 'named vector', we remove the name to keep the value
x_iqr <- unname(x_iqr[2])
y_iqr <- unname(y_iqr[3])
x_iqr
## [1] 1129.5
y_iqr
## [1] 163000
A
The probablility that X is greater than 1129.5 given that Y is greater than 163000.
\[P(X>x|Y>y)\]
print(paste0("Number of X and Y variables is ", length(x)))
## [1] "Number of X and Y variables is 1460"
print(paste0("Number of Y variables greater than 163000 is ", sum(y > 163000)))
## [1] "Number of Y variables greater than 163000 is 728"
#Create subset of y values greater than 163000
a <- subset(df_xy) %>%
filter(y > 163000)
print(paste0("Number of X variables greater than 1129.5 out of the 728 is ", sum(a$x > 1129.5), ", or ", round(720/728,4)*100,"%"))
## [1] "Number of X variables greater than 1129.5 out of the 728 is 720, or 98.9%"
Answer
\(P(X>x|Y>y) = .989\)
B
The probablility that X is greater than 1129.5 AND that Y is greater than 163000.
\[P(X>x \& Y>y)\]
xg <- sum(df_xy$x > 1129.5)/1460
yg <- sum(df_xy$y > 163000)/1460
print(paste0("Probability of X > 1129.5 is ", round(xg,4)*100, "%"))
## [1] "Probability of X > 1129.5 is 75%"
print(paste0("Probability of Y > 163000 is ", round(yg,4)*100, "%"))
## [1] "Probability of Y > 163000 is 49.86%"
print(paste0("Probability of both happening is P(X) * P(Y): ", round(xg*yg,4)*100, "%"))
## [1] "Probability of both happening is P(X) * P(Y): 37.4%"
Answer
\(P(X>x \& Y>y) = .374\)
C
The probability that X is less than 1229.5 given that Y is greater than 163000
\[P(X<x|Y>x)\]
print(paste0("Number of Y variables greater than 163000 is ", sum(y > 163000)))
## [1] "Number of Y variables greater than 163000 is 728"
print(paste0("Number of X variables less than 1129.5 out of the 728 is ", sum(a$x < 1129.5), ", or ", round(8/728,4)*100,"%"))
## [1] "Number of X variables less than 1129.5 out of the 728 is 8, or 1.1%"
Answer
\(P(X<x|Y>x) = .011\)
Test For Independence
\[P(X|Y)=P(X)P(Y)\]
Test for independence when the probablility that X is greater than 1129.5 given that Y is greater than 163000.
\(P(X>x|Y>y) = P(X|Y) = .989\)
and
\(P(X)P(Y) = 0.374\)
so
\(P(X|Y) \neq P(X)P(Y)\) therefore, the variables are dependent.
#Prob of P(X) and P(Y)
px <- sum(df_xy$x > 1129.5)/1460
py <- sum(df_xy$y > 163000)/1460
#P(X)P(Y)
px*py
## [1] 0.3739726
Since the Chi-test p-value is smaller than the .05 significance level, we reject the null hypothesis that GrLivArea and SalePrice are independent. These values are DEPENDENT.
#Chi-test
#Null hypothesis is that they are independent
#Significant value is 0.05
chisq.test(x, y)
##
## Pearson's Chi-squared test
##
## data: x and y
## X-squared = 589730, df = 569320, p-value < 0.00000000000000022
Analysis of ‘GrLivArea’ values
#Find the quantiles
print("Summary for 'GrLivArea' values are ")
## [1] "Summary for 'GrLivArea' values are "
summary(x)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1130 1464 1515 1777 5642
print(paste0("Standard Deviation is ", round(sd(x),2)))
## [1] "Standard Deviation is 525.48"
mx <- round(mean(x))
my <- round(mean(y))
#Find the Residuals for x and y columns and store in separate columns
#Add those columns to the df_xy data frame
res_x <- c()
res_y <- c()
for(i in x){
res_x <- c(res_x, mx-i)
}
for(i in y){
res_y <- c(res_y, my-i)
}
df_xy$res_x <- res_x
df_xy$res_y <- res_y
df_xy$num <- seq(1:1460)
#Residual Plot
ggplot(df_xy, aes(x = num, y = res_x)) +
geom_point() +
geom_hline(yintercept = 0, color= "red") +
ggtitle("GrLivArea Residual Plot") +
xlab("GrLivArea Values") +
ylab("Mean - GrLivArea")
#Plot histogram
ggplot(df_xy, aes(x = x)) +
geom_histogram( binwidth = 50, fill = "grey", color = "black") +
geom_vline(xintercept = mx, color = "red") +
ggtitle("Histogram of GrLivArea") +
xlab("Living Area Square Footage") +
ylab("Frequency") +
scale_x_continuous(breaks = c(300, mx, 1000, 2000, 3000, 4000, 5000, 5700),
labels = c(300, 1515, 1000, 2000, 3000, 4000, 5000, 5700))
Analysis of ‘SalePrice’ values
print("Summary for 'SalePrice' values are ")
## [1] "Summary for 'SalePrice' values are "
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
print(paste0("Standard Deviation of y is ", round(sd(y),2)))
## [1] "Standard Deviation of y is 79442.5"
#Residual Plot
ggplot(df_xy, aes(x = num, y = res_y)) +
geom_point() +
geom_hline(yintercept = 0, color= "red") +
ggtitle("SalePrice Residual Plot") +
xlab("SalePrice Values") +
ylab("Mean - SalePrice")
#Histogram Plot
ggplot(df_xy, aes(x = y)) +
geom_histogram(bins = 80, fill = "grey", color = "black") +
geom_vline(xintercept = my, color = "red") +
ggtitle("Histogram of SalePrice") +
xlab("Sale Price") +
ylab("Frequency") +
scale_x_continuous(breaks = c(30000, 100000, my, 300000, 500000, 700000),
labels = c(30000, 100000, 180921, 300000, 500000, 700000))
Comparison of GrLivArea and SalePrice values
Plot has a ‘megaphone’ look to it
library(MASS)
#Scatterplot of x and y
ggplot(df_xy, aes(x = x, y = y)) +
geom_point() +
xlab("GrLivArea") +
ylab("SalePrice") +
ggtitle("GrLivArea vs SalePrice") +
geom_abline(slope = my/mx, color = "red")
Plot the boxcox. Find the lambda value for the data transformation.
#boxcox plot
box <- boxcox(lm(y~x, data = df_xy), lambda = seq(-1,1,by=.1))
#which lambda value is the greatest.
#Pick that number for the transformation
lam <- box$x[which.max(box$y)]
print(paste0("Lambda value for transformation is ", round(lam,1)))
## [1] "Lambda value for transformation is 0.1"
‘megaphone’ look is gone after boxcox transformation.
#boxcox transformation scatterplot of x and y
#Lamda 0.1 transforms the y-value to a power of .1
ggplot(df_xy, aes(x = x, y = y^.1)) +
geom_point() +
xlab("GrLivArea") +
ylab("SalePrice") +
ggtitle("BoxCox Transformation of GrLivArea vs SalePrice")
Nontransformed variables are GrLivArea, TotalBsmtSF, X1stFlrSF, and BedroomAbvGr.
Build a data frame of the variables and create a correlation matrix and invert it.
df_full <- data.frame(train_data$TotalBsmtSF, train_data$GrLivArea, train_data$X1stFlrSF, train_data$BedroomAbvGr)
colnames(df_full) <- c("TotalBsmtSF", "GrLivArea", "FirstFlrSF", "BedroomAbvGr")
head(df_full)
## TotalBsmtSF GrLivArea FirstFlrSF BedroomAbvGr
## 1 856 1710 856 3
## 2 1262 1262 1262 3
## 3 920 1786 920 3
## 4 756 1717 961 3
## 5 1145 2198 1145 4
## 6 796 1362 796 1
#Create a correlation matrix
print("Correlation Matrix")
## [1] "Correlation Matrix"
cor_matrix <- cor(df_full)
cor_matrix
## TotalBsmtSF GrLivArea FirstFlrSF BedroomAbvGr
## TotalBsmtSF 1.00000000 0.4548682 0.8195300 0.05044996
## GrLivArea 0.45486820 1.0000000 0.5660240 0.52126951
## FirstFlrSF 0.81952998 0.5660240 1.0000000 0.12740075
## BedroomAbvGr 0.05044996 0.5212695 0.1274007 1.00000000
#Invert the correlation matrix to make a precision matrix
print("Precision Matrix")
## [1] "Precision Matrix"
inv_cor_matrix <- solve(cor_matrix)
inv_cor_matrix
## TotalBsmtSF GrLivArea FirstFlrSF BedroomAbvGr
## TotalBsmtSF 3.0779061 -0.1013792 -2.4924580 0.2151067
## GrLivArea -0.1013792 2.1106174 -0.9880998 -0.9692013
## FirstFlrSF -2.4924580 -0.9880998 3.5783728 0.1849233
## BedroomAbvGr 0.2151067 -0.9692013 0.1849233 1.4708036
#Correlation Matrix * Precision Matrix
print("Correlation Matrix * Precision Matrix")
## [1] "Correlation Matrix * Precision Matrix"
round(cor_matrix %*% inv_cor_matrix)
## TotalBsmtSF GrLivArea FirstFlrSF BedroomAbvGr
## TotalBsmtSF 1 0 0 0
## GrLivArea 0 1 0 0
## FirstFlrSF 0 0 1 0
## BedroomAbvGr 0 0 0 1
#Precision Matrix * Correlation Matrix
print("Precision Matrix * Correlation Matrix")
## [1] "Precision Matrix * Correlation Matrix"
round(inv_cor_matrix %*% cor_matrix)
## TotalBsmtSF GrLivArea FirstFlrSF BedroomAbvGr
## TotalBsmtSF 1 0 0 0
## GrLivArea 0 1 0 0
## FirstFlrSF 0 0 1 0
## BedroomAbvGr 0 0 0 1
Density Curves:
Gamma (orange) looks to be the best fit
fd_n <- fitdistr(x, "normal")
fd_c <- fitdistr(x, "cauchy")
fd_w <- fitdistr(x, "weibull")
fd_g <- fitdistr(x, "gamma")
fd_ln <- fitdistr(x, "lognormal")
hist(x, breaks = 50, prob=TRUE, main = "Density Plot of X", ylim = c(0, .0011))
curve(dnorm(x, fd_n$estimate[1], fd_n$estimate[2]), col="red", lwd=2, add=T)
curve(dcauchy(x, fd_c$estimate[1], fd_c$estimate[2]), col="blue", lwd=2, add=T)
curve(dlnorm(x, fd_ln$estimate[1], fd_ln$estimate[2]), col="green", lwd=2, add=T)
curve(dgamma(x, fd_g$estimate[1], fd_g$estimate[2]), col="orange", lwd=2, add=T)
curve(dweibull(x, fd_w$estimate[1], fd_w$estimate[2]), col="purple", lwd=2, add=T)
Simulated Distribution
ran <- rgamma(1000, fd_g$estimate[1], fd_g$estimate[2])
hist(ran, breaks = 50, prob = TRUE, main = "Simulated Density Plot")
curve(dgamma(x, fd_g$estimate[1], fd_g$estimate[2]), col="red", lwd=2, add=T)
Find a linear model of best fit
Many models were tried and backwards elimination of the high p-values were removed.
# model1 <- lm(SalePrice ~ Exterior1st+Exterior2nd+MasVnrType+
# MasVnrArea+ExterQual+ExterCond+Foundation+BsmtQual+BsmtCond+BsmtExposure+BsmtFinType1+
# BsmtFinSF1+BsmtFinType2+BsmtFinSF2+BsmtUnfSF+MSSubClass+MSZoning+LotArea+Street+LotShape+LandContour+
# Utilities+LotConfig+LandSlope+Neighborhood+Condition1+Condition2+BldgType+HouseStyle+OverallQual+
# OverallCond+YearBuilt, data = train_data)
#
model2 <- lm(SalePrice ~ YearRemodAdd+RoofStyle+GarageCars+GarageArea+GarageQual+GarageCond+
WoodDeckSF+OpenPorchSF+EnclosedPorch+X3SsnPorch+ScreenPorch+PoolArea+Fence+MiscVal+
MoSold+YrSold+SaleType+SaleCondition+TotalBsmtSF+Heating+HeatingQC+CentralAir+
Electrical+X1stFlrSF+X2ndFlrSF+LowQualFinSF+GrLivArea+BsmtFullBath+BsmtHalfBath+FullBath+
HalfBath+BedroomAbvGr+KitchenAbvGr+KitchenQual+TotRmsAbvGrd+Functional+Fireplaces+FireplaceQu+
GarageType+GarageYrBlt+GarageFinish+RoofMatl+OverallQual+OverallCond, data = train_data)
model3 <- lm(SalePrice ~ BsmtQual+BsmtExposure+BsmtFinType1+LotArea+Neighborhood+OverallQual+OverallCond+
TotalBsmtSF+X1stFlrSF+X2ndFlrSF+KitchenQual+HeatingQC+SaleType, data=train_data)
# model2 <- lm(SalePrice ~ YearRemodAdd+RoofStyle+GarageCars+GarageArea+GarageQual+GarageCond+
# WoodDeckSF+OpenPorchSF+EnclosedPorch+X3SsnPorch+ScreenPorch+PoolArea+Fence+MiscVal+
# MoSold+YrSold+SaleType+SaleCondition+TotalBsmtSF+Heating+HeatingQC+CentralAir+
# Electrical+X1stFlrSF+X2ndFlrSF+LowQualFinSF+GrLivArea+BsmtFullBath+BsmtHalfBath+FullBath+
# HalfBath+BedroomAbvGr+KitchenAbvGr+KitchenQual+TotRmsAbvGrd+Functional+Fireplaces+FireplaceQu+
# GarageType+GarageYrBlt+GarageFinish+RoofMatl+OverallQual+OverallCond, data = train_data)
sum3 <- summary(model3)
sum2 <- summary(model2)
#sum2
#sum3
Find high correlation between independent numeric variables. Highest was only 49%.
train_prep <- read.csv("https://raw.githubusercontent.com/smithchad17/Class605/master/train_prep.csv", header = T, stringsAsFactors = F)
c <- cor(train_prep)
ct <- as.data.frame(as.table(c))
head(ct)
## Var1 Var2 Freq
## 1 X X 1.00000000
## 2 Id X 1.00000000
## 3 MSSubClass X 0.01115648
## 4 LotFrontage X NA
## 5 LotArea X -0.03322552
## 6 OverallQual X -0.02836475
Predict SalePrice, prep results and save to csv file
#Make a subset of the data with the columns the linear model uses
# test_prep <- subset(test_data, select = c(YearRemodAdd,RoofStyle,GarageCars,GarageArea,GarageQual,GarageCond,
# WoodDeckSF,OpenPorchSF,EnclosedPorch,X3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,
# MoSold,YrSold,SaleType,SaleCondition,TotalBsmtSF,Heating,HeatingQC,CentralAir,
# Electrical,X1stFlrSF,X2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,
# HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,
# GarageType,GarageYrBlt,GarageFinish,RoofMatl,OverallQual,OverallCond))
test_prep2 <- subset(test_data, select = c(BsmtQual,BsmtExposure,BsmtFinType1,LotArea,
Neighborhood,OverallQual,OverallCond,
TotalBsmtSF,X1stFlrSF,X2ndFlrSF,KitchenQual,
HeatingQC,SaleType))
pred_table <- as.data.frame(as.table(predict(model3, test_prep2)))
pred_table$Id <- test_data$Id
pred_table <- subset(pred_table, select = c(Id, Freq))
colnames(pred_table) <- c("Id", "SalePrice")
head(pred_table)
## Id SalePrice
## 1 1461 113004.4
## 2 1462 162689.9
## 3 1463 169865.4
## 4 1464 194069.3
## 5 1465 240583.8
## 6 1466 165235.9
write.csv(pred_table, "predictions.csv")