Created my tibbles: loaded the data from the file and split it into train and test tibbles.
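A minimal sketch of that step (not the original code), assuming the raw data lives in a CSV named ames.csv and that a random 80/20 split was used; the real file name, seed, and split ratio may differ.
library(tidyverse)
set.seed(123)  # hypothetical seed, for a reproducible split
t <- read_csv('ames.csv')  # hypothetical file name; full dataset as a tibble
train_rows <- sample(nrow(t), size = floor(0.8 * nrow(t)))
t_train <- t[train_rows, ]   # training tibble
t_test <- t[-train_rows, ]   # hold-out test tibble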
Ran a histogram on each numeric variable of interest to check its distribution.
options(scipen = 99)  # suppress scientific notation in printed output
hist(t$overall_qual)
hist(t$gr_liv_area)
hist(t$garage_area)
hist(t$total_bsmt_sf)
hist(t$kitchen_qual)
Examined the correlation matrix and pairwise scatterplots to confirm roughly linear relationships with sale price and no extreme outliers.
library(ggcorrplot)
t_plot <- t %>%
  select(sale_price, overall_qual, gr_liv_area, garage_area, total_bsmt_sf, kitchen_qual)
ggcorrplot::ggcorrplot(cor(t_plot))
# Shows that every independent variable picked is strongly positively correlated with sale price
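The pairwise part of that check could be reproduced with base R's pairs() (a sketch; this exact call is not shown in the original run):
pairs(t_plot)  # scatterplot matrix of every selected variable against the others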
Fit my lm model.
model <- lm(sale_price ~ overall_qual + gr_liv_area + garage_area + total_bsmt_sf + kitchen_qual, data = t_train)
# This summary shows that every variable has a significant positive relationship with sale price
summary(model)
##
## Call:
## lm(formula = sale_price ~ overall_qual + gr_liv_area + garage_area +
## total_bsmt_sf + kitchen_qual, data = t_train)
##
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -515898  -19152   -1498   16198  277810 
##
## Coefficients:
##                  Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   -137561.000   3993.693  -34.45 <0.0000000000000002 ***
## overall_qual    19600.457    816.367   24.01 <0.0000000000000002 ***
## gr_liv_area        45.582      1.805   25.25 <0.0000000000000002 ***
## garage_area        52.180      4.280   12.19 <0.0000000000000002 ***
## total_bsmt_sf      29.375      2.031   14.46 <0.0000000000000002 ***
## kitchen_qual    21362.092   1486.776   14.37 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36590 on 2583 degrees of freedom
## Multiple R-squared: 0.7938, Adjusted R-squared: 0.7934
## F-statistic: 1989 on 5 and 2583 DF, p-value: < 0.00000000000000022
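Optionally (not part of the original run), the same coefficient table can be pulled out as a tibble with broom, which makes it easier to work with programmatically.
library(broom)
tidy(model)  # estimates, standard errors, t values, and p-values in tidy form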
# These lines calculate the predicted values and residuals (defined here as predicted minus actual) for both the training and test data sets, based on the fitted model
t_train <- t_train %>%
  mutate(predicted = predict(model, newdata = t_train),
         residual = predicted - sale_price)
t_test <- t_test %>%
  mutate(predicted = predict(model, newdata = t_test),
         residual = predicted - sale_price)
# This line calculates the mean of the residuals for the test dataset. It measures the model's average prediction bias on unseen data: a value near zero means over- and under-predictions roughly cancel out.
mean(t_test$residual)
## [1] 133.1496
# Analysis
# The mean of the test residuals is 133.1496
# Since residual = predicted - sale_price here, a positive mean means the model over-predicts sale prices in the test dataset by about $133 on average, a small bias relative to typical sale prices.
mean(t_train$residual)
## [1] 0.000000003154171
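# The training residuals average to essentially zero by construction: OLS with an intercept forces the residuals to sum to zero on the data the model was fit on, so only the test mean tells us about bias.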
# These lines calculate the Root Mean Squared Error (RMSE) for both the training and test data sets
# RMSE measures the typical magnitude of the model's prediction error, in the same units as sale price
rmse_test <- sqrt(mean(t_test$residual^2))
rmse_train <- sqrt(mean(t_train$residual^2))
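Printing both values (output not shown here) lets you compare them directly; a test RMSE close to the training RMSE is a sign the model is not overfitting.
rmse_train
rmse_test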
# These lines generate a scatter plot comparing the predicted sale prices against the actual values for both the training and test data sets.
# The geom_abline represents perfect predictions. Blue points represent test data, and green points represent training data.
ggplot() +
  geom_abline(intercept = 0, slope = 1, color = 'black') +
  geom_point(data = t_test,
             mapping = aes(x = predicted, y = sale_price),
             color = 'blue') +
  geom_point(data = t_train,
             mapping = aes(x = predicted, y = sale_price),
             color = 'green', alpha = 0.1)
# This graph shows that the test and train points are near the black diagonal line
# The model's predictions closely match the actual sale prices for both the training and test data sets
# Calculated R-squared for train
# Took the `sum` of the squared error (the residual sum of squares)
residual_sum_of_squares <- sum(t_train$residual^2)
# Found the difference between the mean and every row
total_variation_in_model <- mean(t_train$sale_price) - t_train$sale_price
# Took the sum of the squared variation
total_sum_of_squares <- sum(total_variation_in_model^2)
# Calculated R^2 as 1 minus the residual sum of squares divided by the total sum of squares
r2 <- 1 - (residual_sum_of_squares / total_sum_of_squares)
print(c('r-squared for train', r2))
## [1] "r-squared for train" "0.793824349783499"
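As a quick sanity check (an optional step, not in the original run), the hand-computed training R-squared can be compared with the value lm() itself reports; the two should match.
summary(model)$r.squared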
# Calculated R-squared for test
# Took the `sum` of the squared error
residual_sum_of_squares2 <- sum(t_test$residual^2)
# Found the difference between the mean and every row
total_variation_in_model2 <- mean(t_test$sale_price) - t_test$sale_price
# Took the sum of the squared variation
total_sum_of_squares2 <- sum(total_variation_in_model2^2)
# Calculated R^2
r22 <- 1 - (residual_sum_of_squares2 / total_sum_of_squares2)
print(c('r-squared for test', r22))
## [1] "r-squared for test" "0.812278279784035"
# Analysis
# 79.38% of the variance in the sale prices of the training data is explained by the independent variables in my model
# 81.23% of the variance in the sale prices of the test data is explained by the model
# This suggests the model generalizes well: it performs slightly better on the unseen test data than on the training data, and the two values are close enough that there is no sign of overfitting