Read in Libraries and Data

#library(dplyr)
#library(knitr)
#library(moments)
#library(kableExtra)
#library(reshape2)
#library(ggplot2)
#library(tidyr)
#library(corrplot)
#library(MASS)
train = read.csv("/Users/Michele/Desktop/data_605_final/train.csv", as.is = TRUE)
test = read.csv("/Users/Michele/Desktop/data_605_final/test.csv", as.is = TRUE)
#filter out qualitative datasets
qualitiative_train = select_if(train, is.numeric)

Choosing an Independent Variable

– Pick one of the quantitative independent variables from the training data set (train.csv) , and define that variable as X. Make sure this variable is skewed to the right!

SalePrice

This is a good independent variable to determine and it is skewed to the right.

saleprice_density <- qualitiative_train %>%
  ggplot(aes(SalePrice)) + geom_density()
saleprice_density

X = qualitiative_train$SalePrice

summary(X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000

Choosing a Dependent Variable

– Pick the dependent variable and define it as Y.

A good variable to use to determine SalePrice would be GrLivArea, which indicates the amount of above ground living area for the building.

GrLivArea_density <- qualitiative_train %>%
  ggplot(aes(GrLivArea)) + geom_density()
GrLivArea_density

Y = qualitiative_train$GrLivArea

summary(Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1130    1464    1515    1777    5642

Probability

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.

#subset data we desire
variables <- c("SalePrice", "GrLivArea")
data <- train[variables]

#obtain 25 percentiles
x = quantile(X, 0.25)
y = quantile(Y, 0.25)

x
##    25% 
## 129975
y
##    25% 
## 1129.5

a. P(X>x | Y>y)

Here, we are calculating the probability Y>y and the probabiltity X>x AND Y>y to determine the probabiltity X>x given Y>y.

\[P(B|A) = \dfrac{P(A \ and \ B)}{P(A)}\]

#calculate number of X > 25% of X
greater_X = X[X>x]
#calculate number of Y > 25% of Y
greater_Y = Y[Y>y]
#calculate number of X > 25% of X AND Y > 25% of Y 
data$results <- ifelse(
    ((data$SalePrice > x) & (data$GrLivArea > y)), 1, 0)
# P(X>x|Y>y) = {P(Y>y and X>x)}{P(Y>y)}
(sum(data$results)/length(data$results))/ (length(greater_Y)/length(Y))
## [1] 0.8712329

b. P(X>x, Y>y)

Here, we are calculating the probability that either variable is greater than the 25th percentile, then subtracting the probability that both variables are greater than then 25th percentile.

\[P(A \ \cup \ B) = P(A) \ + \ P(B) - P(A \ \cap \ B)\]

#calculate number of X > 25% of X
greater_X = X[X>x]
#calculate number of Y > 25% of Y
greater_Y = Y[Y>y]
#calculate number of X > 25% of X AND Y > 25% of Y 
data$results <- ifelse(
    ((data$SalePrice > x) & (data$GrLivArea > y)), 1, 0)
# P(X>x) + P(Y>y) - P(X>x) U P(Y>y)
((length(greater_X)/length(X)) + (length(greater_Y)/length(Y))) - (sum(data$results)/length(data$results))
## [1] 0.8465753

c. P(Xy)

Here, we are calculating the probability Y>y and the probabiltity Xy to determine the probabiltity Xy.

\[P(B|A) = \dfrac{P(A \ and \ B)}{P(A)}\]

#calculate number of X < 25% of X
less_X = X[X<x]
#calculate number of Y > 25% of Y
greater_Y = Y[Y>y]
#calculate number of X > 25% of X AND Y > 25% of Y 
data$results <- ifelse(
    ((data$SalePrice < x) & (data$GrLivArea > y)), 1, 0)
# P(X>x|Y>y) = {P(Y>y and X>x)}{P(Y>y)}
(sum(data$results)/length(data$results))/ (length(greater_Y)/length(Y))
## [1] 0.1287671

Table of Counts

both_greater = nrow(subset(data, GrLivArea > y & SalePrice > x))
x_greater_y_less = nrow(subset(data, GrLivArea < y & SalePrice > x))
both_less = nrow(subset(data, GrLivArea < y & SalePrice < x))
x_less_y_greater = nrow(subset(data, GrLivArea > y & SalePrice < x))

x_y = c("greater first quartile", "less first quartile", "Total") 
greater_first_quartile = c(both_greater, x_less_y_greater, both_greater + x_less_y_greater) 
less_first_quartile = c(x_greater_y_less, both_less, x_greater_y_less + both_less) 
Total = c(both_greater+x_greater_y_less, x_less_y_greater+both_less, both_greater + x_less_y_greater + x_greater_y_less + both_less)
data.frame(x_y, greater_first_quartile, less_first_quartile, Total) 

Table of Probabilities

n = (nrow(data))

x_y = c("greater first quartile", "less first quartile", "Total") 
greater_first_quartile = c(both_greater/n, x_less_y_greater/n, both_greater/n + x_less_y_greater/n) 
less_first_quartile = c(x_greater_y_less/n, both_less/n, x_greater_y_less/n + both_less/n) 
Total = c(both_greater/n+x_greater_y_less/n, x_less_y_greater/n+both_less/n, both_greater/n + x_less_y_greater/n + x_greater_y_less/n + both_less/n)
data.frame(x_y, greater_first_quartile, less_first_quartile, Total) 

Independence

Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 1st quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does P(AB)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.

P(AB) =

A = length(X[X>x])
B = length(Y[Y>y])
data$results <- ifelse(
    ((data$SalePrice > x) & (data$GrLivArea > y)), 1, 0)

((A/length(X)) + (B/length(Y))) - (sum(data$results)/length(data$results))
## [1] 0.8465753

P(A)*P(B) =

((A/length(X)) * (B/length(Y))) 
## [1] 0.5625

Splitting the training data in this fashion does not make them independent since we expect Gross Living Area to affect the Sales Price.

Chi Square Test

The chi square test for independence compares two variables to see if they are related. A small chi-sqare p-value demonstrates that there is a significant association between the two variables. We are testing SalesPrice and Gross Living Area below. Notice that the chi-square value is very low at 2.2e-16, indicating that there is a significant association between these two variables and that therefore these variables are not independent of one another.

chisq.test(table(data$SalePrice, data$GrLivArea))
## Warning in chisq.test(table(data$SalePrice, data$GrLivArea)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(data$SalePrice, data$GrLivArea)
## X-squared = 589730, df = 569320, p-value < 2.2e-16

Descriptive and Inferential Statistics.

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

Univariate Descriptive Statistics

#generate clean summary table
training_summary_df <- do.call(data.frame, 
           list(n = sapply(qualitiative_train, function(x) length(x[!is.na(x)])),
                mean = round(sapply(qualitiative_train, function(x) mean(x, na.rm=TRUE)), 2),
                median = round(sapply(qualitiative_train, function(x) median(x, na.rm=TRUE)), 2),
                sd = round(sapply(qualitiative_train, function(x) sd(x, na.rm=TRUE)), 2),
                se = round(sapply(qualitiative_train, function(x) sd(x,na.rm=TRUE)/(sqrt(length(x[!is.na(x)])))), 2),
                min = round(sapply(qualitiative_train, function(x) min(x, na.rm=TRUE)), 2),
                max = round(sapply(qualitiative_train, function(x) max(x, na.rm=TRUE)), 2),
                range = round(sapply(qualitiative_train, function(x) max(x, na.rm=TRUE) - min(x, na.rm=TRUE)), 2),
                kurtosis = round(sapply(qualitiative_train, function(x) kurtosis(x, na.rm=TRUE))), 2))
training_summary_df

Boxplots

meltData <- melt(qualitiative_train)
## No id variables; using all as measure variables
p <- ggplot(meltData, aes(factor(variable), value)) 
p + geom_boxplot() + facet_wrap(~variable, scale="free")
## Warning: Removed 348 rows containing non-finite values (stat_boxplot).

Density Plots

density <- qualitiative_train %>%
  gather() %>%                         
  ggplot(aes(value)) +                 
    facet_wrap(~ key, scales = "free") +
    geom_density()
density
## Warning: Removed 348 rows containing non-finite values (stat_density).

Correlation Matrix for Some Significant Relationships

headers = c("OverallQual", "GarageCars", "SalePrice")
correlation = cor(qualitiative_train[headers])
correlation
##             OverallQual GarageCars SalePrice
## OverallQual   1.0000000  0.6006707 0.7909816
## GarageCars    0.6006707  1.0000000 0.6404092
## SalePrice     0.7909816  0.6404092 1.0000000
corrplot(correlation, method="square")

SalePrice and OverallQual

The first correlation we will visit is between SalePrice and OverallQual (a variables that rates the overall material and finish of the house). Below is a correlation t-test used to determine if there is an association between paired samples. Below, we are showing that using a 92% confidence interval we can reject the null hypothesis that the true correlation between overall quality and sale price is 0. Sales Price and Overall Quality has a good correlation of .79. We are 92% confident that the true correlation is between .77 and .81.

cor.test(train$OverallQual, train$SalePrice, method = "pearson" , conf.level = 0.92)
## 
##  Pearson's product-moment correlation
## 
## data:  train$OverallQual and train$SalePrice
## t = 49.364, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.7731789 0.8075387
## sample estimates:
##       cor 
## 0.7909816

Below is a linear regression summary of the data.

sale_price_overall_qual = lm(qualitiative_train$SalePrice ~ qualitiative_train$OverallQual)
summary(sale_price_overall_qual)
## 
## Call:
## lm(formula = qualitiative_train$SalePrice ~ qualitiative_train$OverallQual)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -198152  -29409   -1845   21463  396848 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -96206.1     5756.4  -16.71   <2e-16 ***
## qualitiative_train$OverallQual  45435.8      920.4   49.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48620 on 1458 degrees of freedom
## Multiple R-squared:  0.6257, Adjusted R-squared:  0.6254 
## F-statistic:  2437 on 1 and 1458 DF,  p-value: < 2.2e-16
plot(qualitiative_train$OverallQual, qualitiative_train$SalePrice)
abline(sale_price_overall_qual)

SalePrice and GarageCars

The second correlation we will visit is between SalePrice and GarageCars (the size of garage in car capacity). Below is a correlation t-test used to determine if there is an association between paired samples. Below, we are showing that using a 92% confidence interval we can reject the null hypothesis that the true correlation between garage cars and sale price is 0. Sales Price and Garage Cars has a good correlation of .64. We are 92% confident that the true correlation is between .61 and .67.

cor.test(train$GarageCars, train$SalePrice, method = "pearson" , conf.level = 0.92)
## 
##  Pearson's product-moment correlation
## 
## data:  train$GarageCars and train$SalePrice
## t = 31.839, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.6125561 0.6666738
## sample estimates:
##       cor 
## 0.6404092

Below is a linear regression summary of the data.

sale_price_overall_qual = lm(qualitiative_train$SalePrice ~ qualitiative_train$GarageCars)
summary(sale_price_overall_qual)
## 
## Call:
## lm(formula = qualitiative_train$SalePrice ~ qualitiative_train$GarageCars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -209931  -36775   -6775   25303  490147 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      60619       4102   14.78   <2e-16 ***
## qualitiative_train$GarageCars    68078       2138   31.84   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 61040 on 1458 degrees of freedom
## Multiple R-squared:  0.4101, Adjusted R-squared:  0.4097 
## F-statistic:  1014 on 1 and 1458 DF,  p-value: < 2.2e-16
plot(qualitiative_train$GarageCars, qualitiative_train$SalePrice)
abline(sale_price_overall_qual)

OverallQual and GarageCars

The final pairwise correlation we will visit is between OverallQual ((a variables that rates the overall material and finish of the hous) and GarageCars (the size of garage in car capacity). Below is a correlation t-test used to determine if there is an association between paired samples. Below, we are showing that using a 92% confidence interval we can reject the null hypothesis that the true correlation between garage cars and sale price is 0. Overall Quality and Garage Cars has a good correlation of .60. We are 92% confident that the true correlation is between .57 and .62.

cor.test(train$GarageCars, train$OverallQual, method = "pearson" , conf.level = 0.92)
## 
##  Pearson's product-moment correlation
## 
## data:  train$GarageCars and train$OverallQual
## t = 28.688, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.5705454 0.6291817
## sample estimates:
##       cor 
## 0.6006707

Below is a linear regression summary of the data.

sale_price_overall_qual = lm(qualitiative_train$OverallQual ~ qualitiative_train$GarageCars)
summary(sale_price_overall_qual)
## 
## Call:
## lm(formula = qualitiative_train$OverallQual ~ qualitiative_train$GarageCars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5814 -0.3582 -0.2466  0.6418  3.8650 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    4.13496    0.07434   55.62   <2e-16 ***
## qualitiative_train$GarageCars  1.11161    0.03875   28.69   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.106 on 1458 degrees of freedom
## Multiple R-squared:  0.3608, Adjusted R-squared:  0.3604 
## F-statistic:   823 on 1 and 1458 DF,  p-value: < 2.2e-16
plot(qualitiative_train$GarageCars, qualitiative_train$OverallQual)
abline(sale_price_overall_qual)

Familywise Error

We should be worried about Type II error when performing analysis with this kind of data because there are a few variables that are correlated with each other and may influence the other. For example, the number of Garage Cars likely influences the overall quality of the house. Therefore we have potential multicolinearity in this model, and we should look out for incorporating many related variables and determining we have a good multiple linear regression model when in actuality we are generating a Type II error, where we retain a false hypothesis.

Linear Algebra and Correlation.

Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

#inverse of correlation matrix
correlation_inv = solve(correlation)
correlation_inv
##             OverallQual GarageCars  SalePrice
## OverallQual   2.7829510 -0.4440392 -1.9168963
## GarageCars   -0.4440392  1.7661209 -0.7798133
## SalePrice    -1.9168963 -0.7798133  3.0156293
#multiply the precision matrix by the correlation matrix
correlation_inv %*% correlation
##              OverallQual    GarageCars     SalePrice
## OverallQual 1.000000e+00 -4.440892e-16 -4.440892e-16
## GarageCars  0.000000e+00  1.000000e+00  0.000000e+00
## SalePrice   2.220446e-16  0.000000e+00  1.000000e+00
#multiply the correlation matrix by the precision matrix
correlation %*% correlation_inv
##               OverallQual GarageCars SalePrice
## OverallQual  1.000000e+00          0         0
## GarageCars  -2.220446e-16          1         0
## SalePrice   -4.440892e-16          0         1

Lower Upper Decomposition

ludecomp <- function(A){

if (nrow(A) == 2){
  value <- -A[2,1] / A[1,1]
  multiply <- matrix(c(1, value, 0, 1), nrow=2, ncol=2) 
  lower_triangular_matrix <- multiply %*% A
  
} else if (nrow(A) == 3){
  value <- (A[,1]/(-1*A[1,1]))
  multiply <- matrix(c(1, value[2], value[3], 0, 1, 0, 0, 0, 1), nrow=3, ncol=3) 
  transformation_1 <- multiply %*% A
  
  value_2 <- -1*(transformation_1[3,2]/transformation_1[2,2])
  multiply_2 <- matrix(c(1, 0, 0, 0, 1, value_2, 0, 0, 1), nrow=3, ncol=3) 
  lower_triangular_matrix <- multiply_2 %*% transformation_1
  
} else if (nrow(A) == 4){
  value <- (A[,1]/(-1*A[1,1]))
  multiply <- matrix(c(1, value[2], value[3], value[4], 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1), nrow=4, ncol=4)
  transformation_1 <- multiply %*% A
  
  value_1 <- (transformation_1[,2]/(-1*transformation_1[2,2]))
  multiply_1 <- matrix(c(1, 0, 0, 0, 0, 1, value_1[3], value_1[4], 0, 0, 1, 0, 0, 0, 0, 1), nrow=4, ncol=4)
  transformation_2 <- round(multiply_1 %*% transformation_1, 1)
  
  value_2 <- round((-1*(transformation_2[4,3])/(transformation_2[3,3])), 1)
  multiply_2 <- matrix(c(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, value_2, 0, 0, 0, 1), nrow=4, ncol=4)
  lower_triangular_matrix <- round(multiply_2 %*% transformation_2, 1) #account for rounding errors
}

  return (list(lower_triangular_matrix = lower_triangular_matrix))
}
ludecomp(correlation %*% correlation_inv)
## $lower_triangular_matrix
##      OverallQual GarageCars SalePrice
## [1,]           1          0         0
## [2,]           0          1         0
## [3,]           0          0         1
ludecomp(correlation_inv %*% correlation)
## $lower_triangular_matrix
##      OverallQual    GarageCars     SalePrice
## [1,]           1 -4.440892e-16 -4.440892e-16
## [2,]           0  1.000000e+00  0.000000e+00
## [3,]           0  0.000000e+00  1.000000e+00
ludecomp(correlation)
## $lower_triangular_matrix
##      OverallQual GarageCars SalePrice
## [1,]           1  0.6006707 0.7909816
## [2,]           0  0.6391947 0.1652897
## [3,]           0  0.0000000 0.3316057
ludecomp(correlation_inv)
## $lower_triangular_matrix
##      OverallQual GarageCars SalePrice
## [1,]    2.782951 -0.4440392 -1.916896
## [2,]    0.000000  1.6952714 -1.085667
## [3,]    0.000000  0.0000000  1.000000

Calculus-Based Probability & Statistics.

Many times, it makes sense to fit a closed form distribution to data. For the first variable that you selected which is skewed to the right, shift it so that the minimum value is above zero as necessary.

We will shift our data so the minimum value is above zero by taking the log of the variable SalePrice.

manipulated_data = qualitiative_train
manipulated_data$SalePrice = log(manipulated_data$SalePrice)

saleprice_density_log <- manipulated_data %>%
  ggplot(aes(SalePrice)) + geom_density()
saleprice_density_log

Then load the MASS package and run fitdistr to fit an exponential probability density function. Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)).

lambda = fitdistr(manipulated_data$SalePrice,"exponential")
distribution = rexp(1000,lambda$estimate)

lambda
##       rate    
##   0.083166647 
##  (0.002176571)

Plot a histogram and compare it with a histogram of your original variable.

hist(distribution, freq = FALSE, breaks = 100, main ="Fitted Exponential PDF with SalePrice", xlim = c(1, quantile(distribution, 0.99)))
curve(dexp(x, rate = lambda$estimate), col = "red", add = TRUE)

hist(X, freq = FALSE, breaks = 100, main ="Histogram of SalePrice")

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

qexp(0.05, rate = lambda$estimate)
## [1] 0.6167532
qexp(0.95, rate = lambda$estimate)
## [1] 36.02084

Modeling

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

Data Manipulation

We are taking the log of these variables because they are skewed to the right. Taking the log is a good way to manipulated right skewed data because it is easy to interpret. However, many of the variables become -Inf when taking the log because many of them contain 0 values, therefore we have added a +1 to these variables.

manipulated_data = qualitiative_train
manipulated_data$BsmtFinSF1 = log(manipulated_data$BsmtFinSF1 + 1)
manipulated_data$BsmtFinSF2 = log(manipulated_data$BsmtFinSF2 + 1)
manipulated_data$BsmtUnfSF = log(manipulated_data$BsmtUnfSF + 1)
manipulated_data$EnclosedPorch = log(manipulated_data$EnclosedPorch + 1)
manipulated_data$GrLivArea = log(manipulated_data$GrLivArea)
manipulated_data$LowQualFinSF = log(manipulated_data$LowQualFinSF + 1)
manipulated_data$PoolArea = log(manipulated_data$PoolArea + 1)
manipulated_data$X2ndFlrSF = log(manipulated_data$X2ndFlrSF + 1)
manipulated_data$X1stFlrSF = log(manipulated_data$X1stFlrSF)
manipulated_data$WoodDeckSF = log(manipulated_data$WoodDeckSF + 1)
manipulated_data$ScreenPorch = log(manipulated_data$ScreenPorch + 1)
manipulated_data$TotalBsmtSF = log(manipulated_data$TotalBsmtSF + 1)
manipulated_data$MiscVal = log(manipulated_data$MiscVal + 1)
manipulated_data$X3SsnPorch = log(manipulated_data$X3SsnPorch + 1)

Notice that our dataset contains far fewer skewed datasets.

manipulated_density <- manipulated_data %>%
  gather() %>%                         
  ggplot(aes(value)) +                 
    facet_wrap(~ key, scales = "free") +
    geom_density()
manipulated_density
## Warning: Removed 348 rows containing non-finite values (stat_density).

We already have a pretty good R2 without removing variables from our model. However, there are many variables that are not statistically significant. We will remove these variables from our model.

sales_price_manipulated = lm(SalePrice ~ MSSubClass + LotFrontage + LotArea + OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + LowQualFinSF + GrLivArea + BsmtFullBath + BsmtHalfBath + FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + Fireplaces + GarageYrBlt + GarageCars + GarageArea + WoodDeckSF + OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch + PoolArea + MiscVal + MoSold + YrSold, data = manipulated_data)
summary(sales_price_manipulated)
## 
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea + 
##     OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + 
##     BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + 
##     X2ndFlrSF + LowQualFinSF + GrLivArea + BsmtFullBath + BsmtHalfBath + 
##     FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + 
##     Fireplaces + GarageYrBlt + GarageCars + GarageArea + WoodDeckSF + 
##     OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch + 
##     PoolArea + MiscVal + MoSold + YrSold, data = manipulated_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -340749  -18339   -2669   15139  356950 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -8.052e+05  1.733e+06  -0.465 0.642357    
## MSSubClass    -1.600e+02  3.582e+01  -4.466 8.80e-06 ***
## LotFrontage   -8.554e+01  6.182e+01  -1.384 0.166721    
## LotArea        7.570e-01  1.584e-01   4.780 2.00e-06 ***
## OverallQual    1.964e+04  1.507e+03  13.033  < 2e-16 ***
## OverallCond    5.224e+03  1.396e+03   3.742 0.000192 ***
## YearBuilt      1.902e+02  9.032e+01   2.106 0.035453 *  
## YearRemodAdd   1.296e+02  8.857e+01   1.464 0.143612    
## MasVnrArea     3.991e+01  6.982e+00   5.716 1.41e-08 ***
## BsmtFinSF1     2.168e+03  5.408e+02   4.008 6.54e-05 ***
## BsmtFinSF2    -9.650e+02  6.962e+02  -1.386 0.166001    
## BsmtUnfSF     -7.796e+02  9.911e+02  -0.787 0.431662    
## TotalBsmtSF    9.659e+02  1.596e+03   0.605 0.545103    
## X1stFlrSF      1.948e+04  1.466e+04   1.328 0.184352    
## X2ndFlrSF     -3.124e+02  1.438e+03  -0.217 0.828023    
## LowQualFinSF  -4.516e+02  1.841e+03  -0.245 0.806264    
## GrLivArea      4.406e+04  1.834e+04   2.403 0.016449 *  
## BsmtFullBath   7.590e+03  3.091e+03   2.455 0.014226 *  
## BsmtHalfBath   1.569e+03  5.146e+03   0.305 0.760485    
## FullBath       1.004e+04  3.608e+03   2.782 0.005498 ** 
## HalfBath       5.998e+03  3.430e+03   1.749 0.080582 .  
## BedroomAbvGr  -1.051e+04  2.195e+03  -4.786 1.94e-06 ***
## KitchenAbvGr  -2.756e+04  6.985e+03  -3.945 8.48e-05 ***
## TotRmsAbvGrd   7.922e+03  1.486e+03   5.332 1.18e-07 ***
## Fireplaces     3.834e+03  2.239e+03   1.713 0.087030 .  
## GarageYrBlt   -2.858e+01  9.289e+01  -0.308 0.758340    
## GarageCars     1.502e+04  3.546e+03   4.238 2.45e-05 ***
## GarageArea     1.569e+01  1.224e+01   1.282 0.200133    
## WoodDeckSF     7.261e+02  4.924e+02   1.474 0.140642    
## OpenPorchSF    1.733e+01  1.954e+01   0.887 0.375406    
## EnclosedPorch  7.966e+02  7.671e+02   1.038 0.299337    
## X3SsnPorch     1.256e+03  1.733e+03   0.725 0.468605    
## ScreenPorch    1.660e+03  8.179e+02   2.030 0.042602 *  
## PoolArea      -8.109e+02  2.592e+03  -0.313 0.754482    
## MiscVal       -8.508e+02  1.038e+03  -0.820 0.412551    
## MoSold        -3.008e+02  4.297e+02  -0.700 0.483954    
## YrSold        -1.271e+02  8.608e+02  -0.148 0.882620    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37370 on 1084 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.8038, Adjusted R-squared:  0.7973 
## F-statistic: 123.4 on 36 and 1084 DF,  p-value: < 2.2e-16

We will remove variables manually using manual backward selection and selection criteria p-value of .05.

sales_price_manipulated_backward_elimination = lm(SalePrice ~ MSSubClass + LotArea + OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + X1stFlrSF + GrLivArea + BsmtFullBath + FullBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF + ScreenPorch, data = manipulated_data)
summary(sales_price_manipulated_backward_elimination)
## 
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual + 
##     OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + 
##     X1stFlrSF + GrLivArea + BsmtFullBath + FullBath + BedroomAbvGr + 
##     KitchenAbvGr + TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF + 
##     ScreenPorch, data = manipulated_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -338848  -18230   -2509   15168  359778 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.313e+06  1.377e+05  -9.535  < 2e-16 ***
## MSSubClass   -1.480e+02  2.719e+01  -5.443 6.16e-08 ***
## LotArea       5.438e-01  1.016e-01   5.350 1.02e-07 ***
## OverallQual   1.880e+04  1.183e+03  15.888  < 2e-16 ***
## OverallCond   3.936e+03  1.040e+03   3.786 0.000159 ***
## YearBuilt     2.309e+02  5.521e+01   4.181 3.07e-05 ***
## YearRemodAdd  1.869e+02  6.761e+01   2.764 0.005778 ** 
## MasVnrArea    4.261e+01  5.954e+00   7.158 1.30e-12 ***
## BsmtFinSF1    1.682e+03  4.106e+02   4.096 4.44e-05 ***
## X1stFlrSF     1.804e+04  4.435e+03   4.068 5.00e-05 ***
## GrLivArea     4.902e+04  6.900e+03   7.104 1.91e-12 ***
## BsmtFullBath  9.713e+03  2.377e+03   4.086 4.62e-05 ***
## FullBath      7.538e+03  2.662e+03   2.832 0.004692 ** 
## BedroomAbvGr -1.121e+04  1.729e+03  -6.486 1.21e-10 ***
## KitchenAbvGr -1.806e+04  5.298e+03  -3.409 0.000670 ***
## TotRmsAbvGrd  8.036e+03  1.227e+03   6.552 7.91e-11 ***
## Fireplaces    3.789e+03  1.812e+03   2.091 0.036695 *  
## GarageCars    1.003e+04  1.746e+03   5.745 1.12e-08 ***
## WoodDeckSF    8.993e+02  3.951e+02   2.276 0.022979 *  
## ScreenPorch   1.559e+03  6.904e+02   2.258 0.024118 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35640 on 1432 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.8006, Adjusted R-squared:  0.798 
## F-statistic: 302.6 on 19 and 1432 DF,  p-value: < 2.2e-16

Notice there is not a significant change with the R2, however the F-statistic has increased, likely because it is affected by the number of variables in the model (the lesser the better).

Using Test Data to Generate Output

Finally, we will perform the same data manipulation on the variables that are skewed in our test data and use the predict() function on our linear regression model generated above to predict sales prices in the test data.

test_manipulation = test
test_manipulation$BsmtFinSF1 = log(test_manipulation$BsmtFinSF1 + 1)
test_manipulation$BsmtFinSF2 = log(test_manipulation$BsmtFinSF2 + 1)
test_manipulation$BsmtUnfSF = log(test_manipulation$BsmtUnfSF + 1)
test_manipulation$EnclosedPorch = log(test_manipulation$EnclosedPorch + 1)
test_manipulation$GrLivArea = log(test_manipulation$GrLivArea)
test_manipulation$LowQualFinSF = log(test_manipulation$LowQualFinSF + 1)
test_manipulation$PoolArea = log(test_manipulation$PoolArea + 1)
test_manipulation$X2ndFlrSF = log(test_manipulation$X2ndFlrSF + 1)
test_manipulation$X1stFlrSF = log(test_manipulation$X1stFlrSF)
test_manipulation$WoodDeckSF = log(test_manipulation$WoodDeckSF + 1)
test_manipulation$ScreenPorch = log(test_manipulation$ScreenPorch + 1)
test_manipulation$TotalBsmtSF = log(test_manipulation$TotalBsmtSF + 1)
test_manipulation$MiscVal = log(test_manipulation$MiscVal + 1)
test_manipulation$X3SsnPorch = log(test_manipulation$X3SsnPorch + 1)
test_manipulation$GarageCars = 

test_manipulation$GarageCars[is.na(test_manipulation$GarageCars)] <- 0
test_manipulation$BsmtFullBath[is.na(test_manipulation$BsmtFullBath)] <- 0

test_manipulation$SalePrice <- predict(sales_price_manipulated_backward_elimination,test_manipulation,type='response')

head(test_manipulation)

Kaggle

Here, we are exporting our data to send to the Kaggle Compeition.

Username: MicheleBradley

Score: 0.87098

headers = c("Id", "SalePrice")
kaggle_results = test_manipulation[headers]
#write.csv(kaggle_results, "kaggle_results.csv")