#library(dplyr)
#library(knitr)
#library(moments)
#library(kableExtra)
#library(reshape2)
#library(ggplot2)
#library(tidyr)
#library(corrplot)
#library(MASS)
train = read.csv("/Users/Michele/Desktop/data_605_final/train.csv", as.is = TRUE)
test = read.csv("/Users/Michele/Desktop/data_605_final/test.csv", as.is = TRUE)
#filter out qualitative datasets
qualitiative_train = select_if(train, is.numeric)
– Pick one of the quantitative independent variables from the training data set (train.csv) , and define that variable as X. Make sure this variable is skewed to the right!
This is a good independent variable to determine and it is skewed to the right.
saleprice_density <- qualitiative_train %>%
ggplot(aes(SalePrice)) + geom_density()
saleprice_density
X = qualitiative_train$SalePrice
summary(X)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
– Pick the dependent variable and define it as Y.
A good variable to use to determine SalePrice would be GrLivArea, which indicates the amount of above ground living area for the building.
GrLivArea_density <- qualitiative_train %>%
ggplot(aes(GrLivArea)) + geom_density()
GrLivArea_density
Y = qualitiative_train$GrLivArea
summary(Y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1130 1464 1515 1777 5642
Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.
#subset data we desire
variables <- c("SalePrice", "GrLivArea")
data <- train[variables]
#obtain 25 percentiles
x = quantile(X, 0.25)
y = quantile(Y, 0.25)
x
## 25%
## 129975
y
## 25%
## 1129.5
Here, we are calculating the probability Y>y and the probabiltity X>x AND Y>y to determine the probabiltity X>x given Y>y.
\[P(B|A) = \dfrac{P(A \ and \ B)}{P(A)}\]
#calculate number of X > 25% of X
greater_X = X[X>x]
#calculate number of Y > 25% of Y
greater_Y = Y[Y>y]
#calculate number of X > 25% of X AND Y > 25% of Y
data$results <- ifelse(
((data$SalePrice > x) & (data$GrLivArea > y)), 1, 0)
# P(X>x|Y>y) = {P(Y>y and X>x)}{P(Y>y)}
(sum(data$results)/length(data$results))/ (length(greater_Y)/length(Y))
## [1] 0.8712329
Here, we are calculating the probability that either variable is greater than the 25th percentile, then subtracting the probability that both variables are greater than then 25th percentile.
\[P(A \ \cup \ B) = P(A) \ + \ P(B) - P(A \ \cap \ B)\]
#calculate number of X > 25% of X
greater_X = X[X>x]
#calculate number of Y > 25% of Y
greater_Y = Y[Y>y]
#calculate number of X > 25% of X AND Y > 25% of Y
data$results <- ifelse(
((data$SalePrice > x) & (data$GrLivArea > y)), 1, 0)
# P(X>x) + P(Y>y) - P(X>x) U P(Y>y)
((length(greater_X)/length(X)) + (length(greater_Y)/length(Y))) - (sum(data$results)/length(data$results))
## [1] 0.8465753
Here, we are calculating the probability Y>y and the probabiltity X
\[P(B|A) = \dfrac{P(A \ and \ B)}{P(A)}\]
#calculate number of X < 25% of X
less_X = X[X<x]
#calculate number of Y > 25% of Y
greater_Y = Y[Y>y]
#calculate number of X > 25% of X AND Y > 25% of Y
data$results <- ifelse(
((data$SalePrice < x) & (data$GrLivArea > y)), 1, 0)
# P(X>x|Y>y) = {P(Y>y and X>x)}{P(Y>y)}
(sum(data$results)/length(data$results))/ (length(greater_Y)/length(Y))
## [1] 0.1287671
both_greater = nrow(subset(data, GrLivArea > y & SalePrice > x))
x_greater_y_less = nrow(subset(data, GrLivArea < y & SalePrice > x))
both_less = nrow(subset(data, GrLivArea < y & SalePrice < x))
x_less_y_greater = nrow(subset(data, GrLivArea > y & SalePrice < x))
x_y = c("greater first quartile", "less first quartile", "Total")
greater_first_quartile = c(both_greater, x_less_y_greater, both_greater + x_less_y_greater)
less_first_quartile = c(x_greater_y_less, both_less, x_greater_y_less + both_less)
Total = c(both_greater+x_greater_y_less, x_less_y_greater+both_less, both_greater + x_less_y_greater + x_greater_y_less + both_less)
data.frame(x_y, greater_first_quartile, less_first_quartile, Total)
n = (nrow(data))
x_y = c("greater first quartile", "less first quartile", "Total")
greater_first_quartile = c(both_greater/n, x_less_y_greater/n, both_greater/n + x_less_y_greater/n)
less_first_quartile = c(x_greater_y_less/n, both_less/n, x_greater_y_less/n + both_less/n)
Total = c(both_greater/n+x_greater_y_less/n, x_less_y_greater/n+both_less/n, both_greater/n + x_less_y_greater/n + x_greater_y_less/n + both_less/n)
data.frame(x_y, greater_first_quartile, less_first_quartile, Total)
Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 1st quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does P(AB)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.
P(AB) =
A = length(X[X>x])
B = length(Y[Y>y])
data$results <- ifelse(
((data$SalePrice > x) & (data$GrLivArea > y)), 1, 0)
((A/length(X)) + (B/length(Y))) - (sum(data$results)/length(data$results))
## [1] 0.8465753
P(A)*P(B) =
((A/length(X)) * (B/length(Y)))
## [1] 0.5625
Splitting the training data in this fashion does not make them independent since we expect Gross Living Area to affect the Sales Price.
The chi square test for independence compares two variables to see if they are related. A small chi-sqare p-value demonstrates that there is a significant association between the two variables. We are testing SalesPrice and Gross Living Area below. Notice that the chi-square value is very low at 2.2e-16, indicating that there is a significant association between these two variables and that therefore these variables are not independent of one another.
chisq.test(table(data$SalePrice, data$GrLivArea))
## Warning in chisq.test(table(data$SalePrice, data$GrLivArea)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(data$SalePrice, data$GrLivArea)
## X-squared = 589730, df = 569320, p-value < 2.2e-16
Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
#generate clean summary table
training_summary_df <- do.call(data.frame,
list(n = sapply(qualitiative_train, function(x) length(x[!is.na(x)])),
mean = round(sapply(qualitiative_train, function(x) mean(x, na.rm=TRUE)), 2),
median = round(sapply(qualitiative_train, function(x) median(x, na.rm=TRUE)), 2),
sd = round(sapply(qualitiative_train, function(x) sd(x, na.rm=TRUE)), 2),
se = round(sapply(qualitiative_train, function(x) sd(x,na.rm=TRUE)/(sqrt(length(x[!is.na(x)])))), 2),
min = round(sapply(qualitiative_train, function(x) min(x, na.rm=TRUE)), 2),
max = round(sapply(qualitiative_train, function(x) max(x, na.rm=TRUE)), 2),
range = round(sapply(qualitiative_train, function(x) max(x, na.rm=TRUE) - min(x, na.rm=TRUE)), 2),
kurtosis = round(sapply(qualitiative_train, function(x) kurtosis(x, na.rm=TRUE))), 2))
training_summary_df
meltData <- melt(qualitiative_train)
## No id variables; using all as measure variables
p <- ggplot(meltData, aes(factor(variable), value))
p + geom_boxplot() + facet_wrap(~variable, scale="free")
## Warning: Removed 348 rows containing non-finite values (stat_boxplot).
density <- qualitiative_train %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_density()
density
## Warning: Removed 348 rows containing non-finite values (stat_density).
headers = c("OverallQual", "GarageCars", "SalePrice")
correlation = cor(qualitiative_train[headers])
correlation
## OverallQual GarageCars SalePrice
## OverallQual 1.0000000 0.6006707 0.7909816
## GarageCars 0.6006707 1.0000000 0.6404092
## SalePrice 0.7909816 0.6404092 1.0000000
corrplot(correlation, method="square")
The first correlation we will visit is between SalePrice and OverallQual (a variables that rates the overall material and finish of the house). Below is a correlation t-test used to determine if there is an association between paired samples. Below, we are showing that using a 92% confidence interval we can reject the null hypothesis that the true correlation between overall quality and sale price is 0. Sales Price and Overall Quality has a good correlation of .79. We are 92% confident that the true correlation is between .77 and .81.
cor.test(train$OverallQual, train$SalePrice, method = "pearson" , conf.level = 0.92)
##
## Pearson's product-moment correlation
##
## data: train$OverallQual and train$SalePrice
## t = 49.364, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## 0.7731789 0.8075387
## sample estimates:
## cor
## 0.7909816
Below is a linear regression summary of the data.
sale_price_overall_qual = lm(qualitiative_train$SalePrice ~ qualitiative_train$OverallQual)
summary(sale_price_overall_qual)
##
## Call:
## lm(formula = qualitiative_train$SalePrice ~ qualitiative_train$OverallQual)
##
## Residuals:
## Min 1Q Median 3Q Max
## -198152 -29409 -1845 21463 396848
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -96206.1 5756.4 -16.71 <2e-16 ***
## qualitiative_train$OverallQual 45435.8 920.4 49.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48620 on 1458 degrees of freedom
## Multiple R-squared: 0.6257, Adjusted R-squared: 0.6254
## F-statistic: 2437 on 1 and 1458 DF, p-value: < 2.2e-16
plot(qualitiative_train$OverallQual, qualitiative_train$SalePrice)
abline(sale_price_overall_qual)
The second correlation we will visit is between SalePrice and GarageCars (the size of garage in car capacity). Below is a correlation t-test used to determine if there is an association between paired samples. Below, we are showing that using a 92% confidence interval we can reject the null hypothesis that the true correlation between garage cars and sale price is 0. Sales Price and Garage Cars has a good correlation of .64. We are 92% confident that the true correlation is between .61 and .67.
cor.test(train$GarageCars, train$SalePrice, method = "pearson" , conf.level = 0.92)
##
## Pearson's product-moment correlation
##
## data: train$GarageCars and train$SalePrice
## t = 31.839, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## 0.6125561 0.6666738
## sample estimates:
## cor
## 0.6404092
Below is a linear regression summary of the data.
sale_price_overall_qual = lm(qualitiative_train$SalePrice ~ qualitiative_train$GarageCars)
summary(sale_price_overall_qual)
##
## Call:
## lm(formula = qualitiative_train$SalePrice ~ qualitiative_train$GarageCars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -209931 -36775 -6775 25303 490147
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60619 4102 14.78 <2e-16 ***
## qualitiative_train$GarageCars 68078 2138 31.84 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 61040 on 1458 degrees of freedom
## Multiple R-squared: 0.4101, Adjusted R-squared: 0.4097
## F-statistic: 1014 on 1 and 1458 DF, p-value: < 2.2e-16
plot(qualitiative_train$GarageCars, qualitiative_train$SalePrice)
abline(sale_price_overall_qual)
The final pairwise correlation we will visit is between OverallQual ((a variables that rates the overall material and finish of the hous) and GarageCars (the size of garage in car capacity). Below is a correlation t-test used to determine if there is an association between paired samples. Below, we are showing that using a 92% confidence interval we can reject the null hypothesis that the true correlation between garage cars and sale price is 0. Overall Quality and Garage Cars has a good correlation of .60. We are 92% confident that the true correlation is between .57 and .62.
cor.test(train$GarageCars, train$OverallQual, method = "pearson" , conf.level = 0.92)
##
## Pearson's product-moment correlation
##
## data: train$GarageCars and train$OverallQual
## t = 28.688, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## 0.5705454 0.6291817
## sample estimates:
## cor
## 0.6006707
Below is a linear regression summary of the data.
sale_price_overall_qual = lm(qualitiative_train$OverallQual ~ qualitiative_train$GarageCars)
summary(sale_price_overall_qual)
##
## Call:
## lm(formula = qualitiative_train$OverallQual ~ qualitiative_train$GarageCars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5814 -0.3582 -0.2466 0.6418 3.8650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.13496 0.07434 55.62 <2e-16 ***
## qualitiative_train$GarageCars 1.11161 0.03875 28.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.106 on 1458 degrees of freedom
## Multiple R-squared: 0.3608, Adjusted R-squared: 0.3604
## F-statistic: 823 on 1 and 1458 DF, p-value: < 2.2e-16
plot(qualitiative_train$GarageCars, qualitiative_train$OverallQual)
abline(sale_price_overall_qual)
We should be worried about Type II error when performing analysis with this kind of data because there are a few variables that are correlated with each other and may influence the other. For example, the number of Garage Cars likely influences the overall quality of the house. Therefore we have potential multicolinearity in this model, and we should look out for incorporating many related variables and determining we have a good multiple linear regression model when in actuality we are generating a Type II error, where we retain a false hypothesis.
Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
#inverse of correlation matrix
correlation_inv = solve(correlation)
correlation_inv
## OverallQual GarageCars SalePrice
## OverallQual 2.7829510 -0.4440392 -1.9168963
## GarageCars -0.4440392 1.7661209 -0.7798133
## SalePrice -1.9168963 -0.7798133 3.0156293
#multiply the precision matrix by the correlation matrix
correlation_inv %*% correlation
## OverallQual GarageCars SalePrice
## OverallQual 1.000000e+00 -4.440892e-16 -4.440892e-16
## GarageCars 0.000000e+00 1.000000e+00 0.000000e+00
## SalePrice 2.220446e-16 0.000000e+00 1.000000e+00
#multiply the correlation matrix by the precision matrix
correlation %*% correlation_inv
## OverallQual GarageCars SalePrice
## OverallQual 1.000000e+00 0 0
## GarageCars -2.220446e-16 1 0
## SalePrice -4.440892e-16 0 1
ludecomp <- function(A){
if (nrow(A) == 2){
value <- -A[2,1] / A[1,1]
multiply <- matrix(c(1, value, 0, 1), nrow=2, ncol=2)
lower_triangular_matrix <- multiply %*% A
} else if (nrow(A) == 3){
value <- (A[,1]/(-1*A[1,1]))
multiply <- matrix(c(1, value[2], value[3], 0, 1, 0, 0, 0, 1), nrow=3, ncol=3)
transformation_1 <- multiply %*% A
value_2 <- -1*(transformation_1[3,2]/transformation_1[2,2])
multiply_2 <- matrix(c(1, 0, 0, 0, 1, value_2, 0, 0, 1), nrow=3, ncol=3)
lower_triangular_matrix <- multiply_2 %*% transformation_1
} else if (nrow(A) == 4){
value <- (A[,1]/(-1*A[1,1]))
multiply <- matrix(c(1, value[2], value[3], value[4], 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1), nrow=4, ncol=4)
transformation_1 <- multiply %*% A
value_1 <- (transformation_1[,2]/(-1*transformation_1[2,2]))
multiply_1 <- matrix(c(1, 0, 0, 0, 0, 1, value_1[3], value_1[4], 0, 0, 1, 0, 0, 0, 0, 1), nrow=4, ncol=4)
transformation_2 <- round(multiply_1 %*% transformation_1, 1)
value_2 <- round((-1*(transformation_2[4,3])/(transformation_2[3,3])), 1)
multiply_2 <- matrix(c(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, value_2, 0, 0, 0, 1), nrow=4, ncol=4)
lower_triangular_matrix <- round(multiply_2 %*% transformation_2, 1) #account for rounding errors
}
return (list(lower_triangular_matrix = lower_triangular_matrix))
}
ludecomp(correlation %*% correlation_inv)
## $lower_triangular_matrix
## OverallQual GarageCars SalePrice
## [1,] 1 0 0
## [2,] 0 1 0
## [3,] 0 0 1
ludecomp(correlation_inv %*% correlation)
## $lower_triangular_matrix
## OverallQual GarageCars SalePrice
## [1,] 1 -4.440892e-16 -4.440892e-16
## [2,] 0 1.000000e+00 0.000000e+00
## [3,] 0 0.000000e+00 1.000000e+00
ludecomp(correlation)
## $lower_triangular_matrix
## OverallQual GarageCars SalePrice
## [1,] 1 0.6006707 0.7909816
## [2,] 0 0.6391947 0.1652897
## [3,] 0 0.0000000 0.3316057
ludecomp(correlation_inv)
## $lower_triangular_matrix
## OverallQual GarageCars SalePrice
## [1,] 2.782951 -0.4440392 -1.916896
## [2,] 0.000000 1.6952714 -1.085667
## [3,] 0.000000 0.0000000 1.000000
Many times, it makes sense to fit a closed form distribution to data. For the first variable that you selected which is skewed to the right, shift it so that the minimum value is above zero as necessary.
We will shift our data so the minimum value is above zero by taking the log of the variable SalePrice.
manipulated_data = qualitiative_train
manipulated_data$SalePrice = log(manipulated_data$SalePrice)
saleprice_density_log <- manipulated_data %>%
ggplot(aes(SalePrice)) + geom_density()
saleprice_density_log
Then load the MASS package and run fitdistr to fit an exponential probability density function. Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)).
lambda = fitdistr(manipulated_data$SalePrice,"exponential")
distribution = rexp(1000,lambda$estimate)
lambda
## rate
## 0.083166647
## (0.002176571)
Plot a histogram and compare it with a histogram of your original variable.
hist(distribution, freq = FALSE, breaks = 100, main ="Fitted Exponential PDF with SalePrice", xlim = c(1, quantile(distribution, 0.99)))
curve(dexp(x, rate = lambda$estimate), col = "red", add = TRUE)
hist(X, freq = FALSE, breaks = 100, main ="Histogram of SalePrice")
Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
qexp(0.05, rate = lambda$estimate)
## [1] 0.6167532
qexp(0.95, rate = lambda$estimate)
## [1] 36.02084
Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
We are taking the log of these variables because they are skewed to the right. Taking the log is a good way to manipulated right skewed data because it is easy to interpret. However, many of the variables become -Inf when taking the log because many of them contain 0 values, therefore we have added a +1 to these variables.
manipulated_data = qualitiative_train
manipulated_data$BsmtFinSF1 = log(manipulated_data$BsmtFinSF1 + 1)
manipulated_data$BsmtFinSF2 = log(manipulated_data$BsmtFinSF2 + 1)
manipulated_data$BsmtUnfSF = log(manipulated_data$BsmtUnfSF + 1)
manipulated_data$EnclosedPorch = log(manipulated_data$EnclosedPorch + 1)
manipulated_data$GrLivArea = log(manipulated_data$GrLivArea)
manipulated_data$LowQualFinSF = log(manipulated_data$LowQualFinSF + 1)
manipulated_data$PoolArea = log(manipulated_data$PoolArea + 1)
manipulated_data$X2ndFlrSF = log(manipulated_data$X2ndFlrSF + 1)
manipulated_data$X1stFlrSF = log(manipulated_data$X1stFlrSF)
manipulated_data$WoodDeckSF = log(manipulated_data$WoodDeckSF + 1)
manipulated_data$ScreenPorch = log(manipulated_data$ScreenPorch + 1)
manipulated_data$TotalBsmtSF = log(manipulated_data$TotalBsmtSF + 1)
manipulated_data$MiscVal = log(manipulated_data$MiscVal + 1)
manipulated_data$X3SsnPorch = log(manipulated_data$X3SsnPorch + 1)
Notice that our dataset contains far fewer skewed datasets.
manipulated_density <- manipulated_data %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_density()
manipulated_density
## Warning: Removed 348 rows containing non-finite values (stat_density).
We already have a pretty good R2 without removing variables from our model. However, there are many variables that are not statistically significant. We will remove these variables from our model.
sales_price_manipulated = lm(SalePrice ~ MSSubClass + LotFrontage + LotArea + OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + LowQualFinSF + GrLivArea + BsmtFullBath + BsmtHalfBath + FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + Fireplaces + GarageYrBlt + GarageCars + GarageArea + WoodDeckSF + OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch + PoolArea + MiscVal + MoSold + YrSold, data = manipulated_data)
summary(sales_price_manipulated)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea +
## OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea +
## BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF +
## X2ndFlrSF + LowQualFinSF + GrLivArea + BsmtFullBath + BsmtHalfBath +
## FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd +
## Fireplaces + GarageYrBlt + GarageCars + GarageArea + WoodDeckSF +
## OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch +
## PoolArea + MiscVal + MoSold + YrSold, data = manipulated_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -340749 -18339 -2669 15139 356950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.052e+05 1.733e+06 -0.465 0.642357
## MSSubClass -1.600e+02 3.582e+01 -4.466 8.80e-06 ***
## LotFrontage -8.554e+01 6.182e+01 -1.384 0.166721
## LotArea 7.570e-01 1.584e-01 4.780 2.00e-06 ***
## OverallQual 1.964e+04 1.507e+03 13.033 < 2e-16 ***
## OverallCond 5.224e+03 1.396e+03 3.742 0.000192 ***
## YearBuilt 1.902e+02 9.032e+01 2.106 0.035453 *
## YearRemodAdd 1.296e+02 8.857e+01 1.464 0.143612
## MasVnrArea 3.991e+01 6.982e+00 5.716 1.41e-08 ***
## BsmtFinSF1 2.168e+03 5.408e+02 4.008 6.54e-05 ***
## BsmtFinSF2 -9.650e+02 6.962e+02 -1.386 0.166001
## BsmtUnfSF -7.796e+02 9.911e+02 -0.787 0.431662
## TotalBsmtSF 9.659e+02 1.596e+03 0.605 0.545103
## X1stFlrSF 1.948e+04 1.466e+04 1.328 0.184352
## X2ndFlrSF -3.124e+02 1.438e+03 -0.217 0.828023
## LowQualFinSF -4.516e+02 1.841e+03 -0.245 0.806264
## GrLivArea 4.406e+04 1.834e+04 2.403 0.016449 *
## BsmtFullBath 7.590e+03 3.091e+03 2.455 0.014226 *
## BsmtHalfBath 1.569e+03 5.146e+03 0.305 0.760485
## FullBath 1.004e+04 3.608e+03 2.782 0.005498 **
## HalfBath 5.998e+03 3.430e+03 1.749 0.080582 .
## BedroomAbvGr -1.051e+04 2.195e+03 -4.786 1.94e-06 ***
## KitchenAbvGr -2.756e+04 6.985e+03 -3.945 8.48e-05 ***
## TotRmsAbvGrd 7.922e+03 1.486e+03 5.332 1.18e-07 ***
## Fireplaces 3.834e+03 2.239e+03 1.713 0.087030 .
## GarageYrBlt -2.858e+01 9.289e+01 -0.308 0.758340
## GarageCars 1.502e+04 3.546e+03 4.238 2.45e-05 ***
## GarageArea 1.569e+01 1.224e+01 1.282 0.200133
## WoodDeckSF 7.261e+02 4.924e+02 1.474 0.140642
## OpenPorchSF 1.733e+01 1.954e+01 0.887 0.375406
## EnclosedPorch 7.966e+02 7.671e+02 1.038 0.299337
## X3SsnPorch 1.256e+03 1.733e+03 0.725 0.468605
## ScreenPorch 1.660e+03 8.179e+02 2.030 0.042602 *
## PoolArea -8.109e+02 2.592e+03 -0.313 0.754482
## MiscVal -8.508e+02 1.038e+03 -0.820 0.412551
## MoSold -3.008e+02 4.297e+02 -0.700 0.483954
## YrSold -1.271e+02 8.608e+02 -0.148 0.882620
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37370 on 1084 degrees of freedom
## (339 observations deleted due to missingness)
## Multiple R-squared: 0.8038, Adjusted R-squared: 0.7973
## F-statistic: 123.4 on 36 and 1084 DF, p-value: < 2.2e-16
We will remove variables manually using manual backward selection and selection criteria p-value of .05.
sales_price_manipulated_backward_elimination = lm(SalePrice ~ MSSubClass + LotArea + OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + X1stFlrSF + GrLivArea + BsmtFullBath + FullBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF + ScreenPorch, data = manipulated_data)
summary(sales_price_manipulated_backward_elimination)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual +
## OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 +
## X1stFlrSF + GrLivArea + BsmtFullBath + FullBath + BedroomAbvGr +
## KitchenAbvGr + TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF +
## ScreenPorch, data = manipulated_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -338848 -18230 -2509 15168 359778
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.313e+06 1.377e+05 -9.535 < 2e-16 ***
## MSSubClass -1.480e+02 2.719e+01 -5.443 6.16e-08 ***
## LotArea 5.438e-01 1.016e-01 5.350 1.02e-07 ***
## OverallQual 1.880e+04 1.183e+03 15.888 < 2e-16 ***
## OverallCond 3.936e+03 1.040e+03 3.786 0.000159 ***
## YearBuilt 2.309e+02 5.521e+01 4.181 3.07e-05 ***
## YearRemodAdd 1.869e+02 6.761e+01 2.764 0.005778 **
## MasVnrArea 4.261e+01 5.954e+00 7.158 1.30e-12 ***
## BsmtFinSF1 1.682e+03 4.106e+02 4.096 4.44e-05 ***
## X1stFlrSF 1.804e+04 4.435e+03 4.068 5.00e-05 ***
## GrLivArea 4.902e+04 6.900e+03 7.104 1.91e-12 ***
## BsmtFullBath 9.713e+03 2.377e+03 4.086 4.62e-05 ***
## FullBath 7.538e+03 2.662e+03 2.832 0.004692 **
## BedroomAbvGr -1.121e+04 1.729e+03 -6.486 1.21e-10 ***
## KitchenAbvGr -1.806e+04 5.298e+03 -3.409 0.000670 ***
## TotRmsAbvGrd 8.036e+03 1.227e+03 6.552 7.91e-11 ***
## Fireplaces 3.789e+03 1.812e+03 2.091 0.036695 *
## GarageCars 1.003e+04 1.746e+03 5.745 1.12e-08 ***
## WoodDeckSF 8.993e+02 3.951e+02 2.276 0.022979 *
## ScreenPorch 1.559e+03 6.904e+02 2.258 0.024118 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35640 on 1432 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.8006, Adjusted R-squared: 0.798
## F-statistic: 302.6 on 19 and 1432 DF, p-value: < 2.2e-16
Notice there is not a significant change with the R2, however the F-statistic has increased, likely because it is affected by the number of variables in the model (the lesser the better).
Finally, we will perform the same data manipulation on the variables that are skewed in our test data and use the predict() function on our linear regression model generated above to predict sales prices in the test data.
test_manipulation = test
test_manipulation$BsmtFinSF1 = log(test_manipulation$BsmtFinSF1 + 1)
test_manipulation$BsmtFinSF2 = log(test_manipulation$BsmtFinSF2 + 1)
test_manipulation$BsmtUnfSF = log(test_manipulation$BsmtUnfSF + 1)
test_manipulation$EnclosedPorch = log(test_manipulation$EnclosedPorch + 1)
test_manipulation$GrLivArea = log(test_manipulation$GrLivArea)
test_manipulation$LowQualFinSF = log(test_manipulation$LowQualFinSF + 1)
test_manipulation$PoolArea = log(test_manipulation$PoolArea + 1)
test_manipulation$X2ndFlrSF = log(test_manipulation$X2ndFlrSF + 1)
test_manipulation$X1stFlrSF = log(test_manipulation$X1stFlrSF)
test_manipulation$WoodDeckSF = log(test_manipulation$WoodDeckSF + 1)
test_manipulation$ScreenPorch = log(test_manipulation$ScreenPorch + 1)
test_manipulation$TotalBsmtSF = log(test_manipulation$TotalBsmtSF + 1)
test_manipulation$MiscVal = log(test_manipulation$MiscVal + 1)
test_manipulation$X3SsnPorch = log(test_manipulation$X3SsnPorch + 1)
test_manipulation$GarageCars =
test_manipulation$GarageCars[is.na(test_manipulation$GarageCars)] <- 0
test_manipulation$BsmtFullBath[is.na(test_manipulation$BsmtFullBath)] <- 0
test_manipulation$SalePrice <- predict(sales_price_manipulated_backward_elimination,test_manipulation,type='response')
head(test_manipulation)
Here, we are exporting our data to send to the Kaggle Compeition.
Username: MicheleBradley
Score: 0.87098
headers = c("Id", "SalePrice")
kaggle_results = test_manipulation[headers]
#write.csv(kaggle_results, "kaggle_results.csv")