Data Prep

test_data <- read.csv("https://raw.githubusercontent.com/smithchad17/Class605/master/FinalExam/test.csv", header = T, stringsAsFactors = F)

train_data <- read.csv("https://raw.githubusercontent.com/smithchad17/Class605/master/FinalExam/train.csv", header = T, stringsAsFactors = F)

Probability

‘GrLivArea’ is the quantitative independent variable

#Store x and y variables in a data frame
x <- train_data$GrLivArea
y <- train_data$SalePrice

df_xy <- data.frame(x, y)

#Get the IQR data from the variables
x_iqr <- quantile(x)
y_iqr <- quantile(y)

#Store the 25% x-value and 50% y-value for analysis
#Since it's a 'named vector', we remove the name to keep the value
x_iqr <- unname(x_iqr[2])
y_iqr <- unname(y_iqr[3])

x_iqr
## [1] 1129.5
y_iqr
## [1] 163000

A

The probablility that X is greater than 1129.5 given that Y is greater than 163000.

\[P(X>x|Y>y)\]

print(paste0("Number of X and Y variables is ", length(x)))
## [1] "Number of X and Y variables is 1460"
print(paste0("Number of Y variables greater than 163000 is ", sum(y > 163000)))
## [1] "Number of Y variables greater than 163000 is 728"
#Create subset of y values greater than 163000
a <- subset(df_xy) %>%
  filter(y > 163000)

print(paste0("Number of X variables greater than 1129.5 out of the 728 is ", sum(a$x > 1129.5), ", or ", round(720/728,4)*100,"%"))
## [1] "Number of X variables greater than 1129.5 out of the 728 is 720, or 98.9%"

Answer

\(P(X>x|Y>y) = .989\)

B

The probablility that X is greater than 1129.5 AND that Y is greater than 163000.

\[P(X>x \& Y>y)\]

xg <- sum(df_xy$x > 1129.5)/1460
yg <- sum(df_xy$y > 163000)/1460

print(paste0("Probability of X > 1129.5 is ", round(xg,4)*100, "%"))
## [1] "Probability of X > 1129.5 is 75%"
print(paste0("Probability of Y > 163000 is ", round(yg,4)*100, "%"))
## [1] "Probability of Y > 163000 is 49.86%"
print(paste0("Probability of both happening is P(X) * P(Y): ", round(xg*yg,4)*100, "%"))
## [1] "Probability of both happening is P(X) * P(Y): 37.4%"

Answer

\(P(X>x \& Y>y) = .374\)

C

The probability that X is less than 1229.5 given that Y is greater than 163000

\[P(X<x|Y>x)\]

print(paste0("Number of Y variables greater than 163000 is ", sum(y > 163000)))
## [1] "Number of Y variables greater than 163000 is 728"
print(paste0("Number of X variables less than 1129.5 out of the 728 is ", sum(a$x < 1129.5), ", or ", round(8/728,4)*100,"%"))
## [1] "Number of X variables less than 1129.5 out of the 728 is 8, or 1.1%"

Answer

\(P(X<x|Y>x) = .011\)

Test For Independence

\[P(X|Y)=P(X)P(Y)\]

Test for independence when the probablility that X is greater than 1129.5 given that Y is greater than 163000.

\(P(X>x|Y>y) = P(X|Y) = .989\)

and

\(P(X)P(Y) = 0.374\)

so

\(P(X|Y) \neq P(X)P(Y)\) therefore, the variables are dependent.

#Prob of P(X) and P(Y)
px <- sum(df_xy$x > 1129.5)/1460
py <- sum(df_xy$y > 163000)/1460

#P(X)P(Y)
px*py
## [1] 0.3739726

Since the Chi-test p-value is smaller than the .05 significance level, we reject the null hypothesis that GrLivArea and SalePrice are independent. These values are DEPENDENT.

#Chi-test
#Null hypothesis is that they are independent
#Significant value is 0.05

chisq.test(x, y)
## 
##  Pearson's Chi-squared test
## 
## data:  x and y
## X-squared = 589730, df = 569320, p-value < 0.00000000000000022

Descriptive and Inferential Statistics

Analysis of ‘GrLivArea’ values

#Find the quantiles
print("Summary for 'GrLivArea' values are ")
## [1] "Summary for 'GrLivArea' values are "
summary(x)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1130    1464    1515    1777    5642
print(paste0("Standard Deviation is ", round(sd(x),2)))
## [1] "Standard Deviation is 525.48"
mx <- round(mean(x))
my <- round(mean(y))

#Find the Residuals for x and y columns and store in separate columns
#Add those columns to the df_xy data frame
res_x <- c()
res_y <- c()
for(i in x){
  res_x <- c(res_x, mx-i)
}
for(i in y){
  res_y <- c(res_y, my-i)
}

df_xy$res_x <- res_x
df_xy$res_y <- res_y
df_xy$num <- seq(1:1460)

#Residual Plot
ggplot(df_xy, aes(x = num, y = res_x)) +
  geom_point() + 
  geom_hline(yintercept = 0, color= "red") +
  ggtitle("GrLivArea Residual Plot") +
  xlab("GrLivArea Values") +
  ylab("Mean - GrLivArea")

#Plot histogram
ggplot(df_xy, aes(x = x)) +
  geom_histogram( binwidth = 50, fill = "grey", color = "black") + 
  geom_vline(xintercept = mx, color = "red") +
  ggtitle("Histogram of GrLivArea") +
  xlab("Living Area Square Footage") +
  ylab("Frequency") +
  scale_x_continuous(breaks = c(300, mx, 1000, 2000, 3000, 4000, 5000, 5700),
                     labels = c(300, 1515, 1000, 2000, 3000, 4000, 5000, 5700))

Analysis of ‘SalePrice’ values

print("Summary for 'SalePrice' values are ")
## [1] "Summary for 'SalePrice' values are "
summary(y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
print(paste0("Standard Deviation of y is ", round(sd(y),2)))
## [1] "Standard Deviation of y is 79442.5"
#Residual Plot
ggplot(df_xy, aes(x = num, y = res_y)) +
  geom_point() + 
  geom_hline(yintercept = 0, color= "red") +
  ggtitle("SalePrice Residual Plot") +
  xlab("SalePrice Values") +
  ylab("Mean - SalePrice")

#Histogram Plot
ggplot(df_xy, aes(x = y)) +
  geom_histogram(bins = 80, fill = "grey", color = "black") + 
  geom_vline(xintercept = my, color = "red") +
  ggtitle("Histogram of SalePrice") +
  xlab("Sale Price") +
  ylab("Frequency") +
  scale_x_continuous(breaks = c(30000, 100000, my, 300000, 500000, 700000),
                     labels = c(30000, 100000, 180921, 300000, 500000, 700000))

Comparison of GrLivArea and SalePrice values

Plot has a ‘megaphone’ look to it

library(MASS)

#Scatterplot of x and y
ggplot(df_xy, aes(x = x, y = y)) +
  geom_point() + 
  xlab("GrLivArea") +
  ylab("SalePrice") + 
  ggtitle("GrLivArea vs SalePrice") +
  geom_abline(slope = my/mx, color = "red")

Plot the boxcox. Find the lambda value for the data transformation.

#boxcox plot
box <- boxcox(lm(y~x, data = df_xy), lambda = seq(-1,1,by=.1))

#which lambda value is the greatest.
#Pick that number for the transformation
lam <- box$x[which.max(box$y)]
print(paste0("Lambda value for transformation is ", round(lam,1)))
## [1] "Lambda value for transformation is 0.1"

‘megaphone’ look is gone after boxcox transformation.

#boxcox transformation scatterplot of x and y
#Lamda 0.1 transforms the y-value to a power of .1
ggplot(df_xy, aes(x = x, y = y^.1)) +
  geom_point() + 
  xlab("GrLivArea") +
  ylab("SalePrice") + 
  ggtitle("BoxCox Transformation of GrLivArea vs SalePrice") 

Linear Algebra and Correlation

Nontransformed variables are GrLivArea, TotalBsmtSF, X1stFlrSF, and BedroomAbvGr.

Build a data frame of the variables and create a correlation matrix and invert it.

df_full <- data.frame(train_data$TotalBsmtSF, train_data$GrLivArea, train_data$X1stFlrSF, train_data$BedroomAbvGr)
colnames(df_full) <- c("TotalBsmtSF", "GrLivArea", "FirstFlrSF", "BedroomAbvGr")
head(df_full)
##   TotalBsmtSF GrLivArea FirstFlrSF BedroomAbvGr
## 1         856      1710        856            3
## 2        1262      1262       1262            3
## 3         920      1786        920            3
## 4         756      1717        961            3
## 5        1145      2198       1145            4
## 6         796      1362        796            1
#Create a correlation matrix
print("Correlation Matrix")
## [1] "Correlation Matrix"
cor_matrix <- cor(df_full)
cor_matrix
##              TotalBsmtSF GrLivArea FirstFlrSF BedroomAbvGr
## TotalBsmtSF   1.00000000 0.4548682  0.8195300   0.05044996
## GrLivArea     0.45486820 1.0000000  0.5660240   0.52126951
## FirstFlrSF    0.81952998 0.5660240  1.0000000   0.12740075
## BedroomAbvGr  0.05044996 0.5212695  0.1274007   1.00000000
#Invert the correlation matrix to make a precision matrix
print("Precision Matrix")
## [1] "Precision Matrix"
inv_cor_matrix <- solve(cor_matrix)
inv_cor_matrix
##              TotalBsmtSF  GrLivArea FirstFlrSF BedroomAbvGr
## TotalBsmtSF    3.0779061 -0.1013792 -2.4924580    0.2151067
## GrLivArea     -0.1013792  2.1106174 -0.9880998   -0.9692013
## FirstFlrSF    -2.4924580 -0.9880998  3.5783728    0.1849233
## BedroomAbvGr   0.2151067 -0.9692013  0.1849233    1.4708036
#Correlation Matrix * Precision Matrix
print("Correlation Matrix * Precision Matrix")
## [1] "Correlation Matrix * Precision Matrix"
round(cor_matrix %*% inv_cor_matrix)
##              TotalBsmtSF GrLivArea FirstFlrSF BedroomAbvGr
## TotalBsmtSF            1         0          0            0
## GrLivArea              0         1          0            0
## FirstFlrSF             0         0          1            0
## BedroomAbvGr           0         0          0            1
#Precision Matrix * Correlation Matrix
print("Precision Matrix * Correlation Matrix")
## [1] "Precision Matrix * Correlation Matrix"
round(inv_cor_matrix %*% cor_matrix)
##              TotalBsmtSF GrLivArea FirstFlrSF BedroomAbvGr
## TotalBsmtSF            1         0          0            0
## GrLivArea              0         1          0            0
## FirstFlrSF             0         0          1            0
## BedroomAbvGr           0         0          0            1

Calculus-Bases Probability & Statistics

Density Curves:

  • Normal = Red
  • Cauchy = Blue
  • Weibull = Purple
  • Gamma = Orange
  • LogNormal = Green

Gamma (orange) looks to be the best fit

fd_n <- fitdistr(x, "normal")
fd_c <- fitdistr(x, "cauchy")
fd_w <- fitdistr(x, "weibull")
fd_g <- fitdistr(x, "gamma")
fd_ln <- fitdistr(x, "lognormal")

hist(x, breaks = 50, prob=TRUE, main = "Density Plot of X", ylim = c(0, .0011))
curve(dnorm(x, fd_n$estimate[1], fd_n$estimate[2]), col="red", lwd=2, add=T)
curve(dcauchy(x, fd_c$estimate[1], fd_c$estimate[2]), col="blue", lwd=2, add=T)
curve(dlnorm(x, fd_ln$estimate[1], fd_ln$estimate[2]), col="green", lwd=2, add=T)
curve(dgamma(x, fd_g$estimate[1], fd_g$estimate[2]), col="orange", lwd=2, add=T)
curve(dweibull(x, fd_w$estimate[1], fd_w$estimate[2]), col="purple", lwd=2, add=T)

Simulated Distribution

ran <- rgamma(1000, fd_g$estimate[1], fd_g$estimate[2])

hist(ran, breaks = 50, prob = TRUE, main = "Simulated Density Plot")
curve(dgamma(x, fd_g$estimate[1], fd_g$estimate[2]), col="red", lwd=2, add=T)

Modeling

Find a linear model of best fit

Many models were tried and backwards elimination of the high p-values were removed.

# model1 <- lm(SalePrice ~ Exterior1st+Exterior2nd+MasVnrType+
#                MasVnrArea+ExterQual+ExterCond+Foundation+BsmtQual+BsmtCond+BsmtExposure+BsmtFinType1+
#               BsmtFinSF1+BsmtFinType2+BsmtFinSF2+BsmtUnfSF+MSSubClass+MSZoning+LotArea+Street+LotShape+LandContour+
#                Utilities+LotConfig+LandSlope+Neighborhood+Condition1+Condition2+BldgType+HouseStyle+OverallQual+
#               OverallCond+YearBuilt, data = train_data)
# 
model2 <- lm(SalePrice ~ YearRemodAdd+RoofStyle+GarageCars+GarageArea+GarageQual+GarageCond+
              WoodDeckSF+OpenPorchSF+EnclosedPorch+X3SsnPorch+ScreenPorch+PoolArea+Fence+MiscVal+
            MoSold+YrSold+SaleType+SaleCondition+TotalBsmtSF+Heating+HeatingQC+CentralAir+
               Electrical+X1stFlrSF+X2ndFlrSF+LowQualFinSF+GrLivArea+BsmtFullBath+BsmtHalfBath+FullBath+
               HalfBath+BedroomAbvGr+KitchenAbvGr+KitchenQual+TotRmsAbvGrd+Functional+Fireplaces+FireplaceQu+
              GarageType+GarageYrBlt+GarageFinish+RoofMatl+OverallQual+OverallCond, data = train_data)

model3 <- lm(SalePrice ~ BsmtQual+BsmtExposure+BsmtFinType1+LotArea+Neighborhood+OverallQual+OverallCond+
               TotalBsmtSF+X1stFlrSF+X2ndFlrSF+KitchenQual+HeatingQC+SaleType, data=train_data)

# model2 <- lm(SalePrice ~ YearRemodAdd+RoofStyle+GarageCars+GarageArea+GarageQual+GarageCond+
#               WoodDeckSF+OpenPorchSF+EnclosedPorch+X3SsnPorch+ScreenPorch+PoolArea+Fence+MiscVal+
#             MoSold+YrSold+SaleType+SaleCondition+TotalBsmtSF+Heating+HeatingQC+CentralAir+
#                Electrical+X1stFlrSF+X2ndFlrSF+LowQualFinSF+GrLivArea+BsmtFullBath+BsmtHalfBath+FullBath+
#                HalfBath+BedroomAbvGr+KitchenAbvGr+KitchenQual+TotRmsAbvGrd+Functional+Fireplaces+FireplaceQu+
#               GarageType+GarageYrBlt+GarageFinish+RoofMatl+OverallQual+OverallCond, data = train_data)


sum3 <- summary(model3)
sum2 <- summary(model2)

#sum2
#sum3

Find high correlation between independent numeric variables. Highest was only 49%.

train_prep <- read.csv("https://raw.githubusercontent.com/smithchad17/Class605/master/train_prep.csv", header = T, stringsAsFactors = F)
c <- cor(train_prep)
ct <- as.data.frame(as.table(c))
head(ct)
##          Var1 Var2        Freq
## 1           X    X  1.00000000
## 2          Id    X  1.00000000
## 3  MSSubClass    X  0.01115648
## 4 LotFrontage    X          NA
## 5     LotArea    X -0.03322552
## 6 OverallQual    X -0.02836475

Predict SalePrice, prep results and save to csv file

#Make a subset of the data with the columns the linear model uses
# test_prep <- subset(test_data, select = c(YearRemodAdd,RoofStyle,GarageCars,GarageArea,GarageQual,GarageCond,
#               WoodDeckSF,OpenPorchSF,EnclosedPorch,X3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,
#             MoSold,YrSold,SaleType,SaleCondition,TotalBsmtSF,Heating,HeatingQC,CentralAir,
#                Electrical,X1stFlrSF,X2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,
#                HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,
#               GarageType,GarageYrBlt,GarageFinish,RoofMatl,OverallQual,OverallCond))

test_prep2 <- subset(test_data, select = c(BsmtQual,BsmtExposure,BsmtFinType1,LotArea,
                                             Neighborhood,OverallQual,OverallCond,
                                          TotalBsmtSF,X1stFlrSF,X2ndFlrSF,KitchenQual,
                                             HeatingQC,SaleType))

pred_table <- as.data.frame(as.table(predict(model3, test_prep2)))
pred_table$Id <- test_data$Id
pred_table <- subset(pred_table, select = c(Id, Freq))
colnames(pred_table) <- c("Id", "SalePrice")
head(pred_table)
##     Id SalePrice
## 1 1461  113004.4
## 2 1462  162689.9
## 3 1463  169865.4
## 4 1464  194069.3
## 5 1465  240583.8
## 6 1466  165235.9
write.csv(pred_table, "predictions.csv")