Step 0: Setup

Loaded the data from the file, created my tibbles, and split them into train and test tibbles.
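
The setup code isn't shown above; here is a minimal sketch of what it could have looked like, assuming a tidyverse workflow, a hypothetical file name ames.csv, janitor for snake_case column names, and a simple random 90/10 split (the file name, the cleaning step, and the split proportion are all assumptions, not the original code).

library(tidyverse)

# Read the raw data into a tibble (file name is an assumption)
t <- read_csv('ames.csv') %>%
  janitor::clean_names()  # snake_case names like sale_price (assumed)

# Randomly split into train and test tibbles (90/10 proportion is an assumption)
set.seed(1)
train_rows <- sample(nrow(t), size = floor(0.9 * nrow(t)))
t_train <- t[train_rows, ]
t_test  <- t[-train_rows, ]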

Step 1: Histograms

Ran a histogram on each numeric predictor of interest to check its distribution.

# Avoid scientific notation on plot axis labels
options(scipen = 99)
hist(t$overall_qual)

hist(t$gr_liv_area)

hist(t$garage_area)

hist(t$total_bsmt_sf)

hist(t$kitchen_qual)

Step 2: Correlation

Examined the correlation matrix and pairwise scatterplots to confirm that each predictor has a roughly linear relationship with sale price and no extreme outliers.

library(ggcorrplot)

t_plot <- t %>%
  select(sale_price, overall_qual,gr_liv_area,garage_area,total_bsmt_sf, kitchen_qual)

ggcorrplot::ggcorrplot(cor(t_plot))

# Shows that all of the chosen independent variables are strongly positively correlated with sale price
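
The pairwise scatterplots mentioned above are not shown; a minimal sketch using base R's pairs() on the same columns would be:

# Pairwise scatterplots to eyeball linearity and outliers
pairs(t_plot)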

Step 3: Model

Fit my linear model with lm(), regressing sale price on the five predictors.

model <- lm(sale_price ~ overall_qual + gr_liv_area + garage_area + total_bsmt_sf + kitchen_qual, data = t_train)

# The summary shows that every variable has a statistically significant positive relationship with sale price

summary(model)
## 
## Call:
## lm(formula = sale_price ~ overall_qual + gr_liv_area + garage_area + 
##     total_bsmt_sf + kitchen_qual, data = t_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -515898  -19152   -1498   16198  277810 
## 
## Coefficients:
##                  Estimate  Std. Error t value            Pr(>|t|)    
## (Intercept)   -137561.000    3993.693  -34.45 <0.0000000000000002 ***
## overall_qual    19600.457     816.367   24.01 <0.0000000000000002 ***
## gr_liv_area        45.582       1.805   25.25 <0.0000000000000002 ***
## garage_area        52.180       4.280   12.19 <0.0000000000000002 ***
## total_bsmt_sf      29.375       2.031   14.46 <0.0000000000000002 ***
## kitchen_qual    21362.092    1486.776   14.37 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36590 on 2583 degrees of freedom
## Multiple R-squared:  0.7938, Adjusted R-squared:  0.7934 
## F-statistic:  1989 on 5 and 2583 DF,  p-value: < 0.00000000000000022
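
As a supplementary check (my addition, not part of the original analysis), confidence intervals for the coefficients make the same point as the p-values: every interval is positive and well away from zero.

# 95% confidence intervals for the fitted coefficients
confint(model)
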
# These lines calculate the predicted values and residuals (defined here as predicted minus actual, so a positive residual means an overprediction) for both the training and test data sets based on the fitted model

t_train <- t_train %>% 
  mutate(predicted = predict(model, newdata = t_train),
         residual = predicted - sale_price)

t_test <- t_test %>%
  mutate(predicted = predict(model, newdata = t_test),
         residual = predicted - sale_price)

# This line calculates the mean of the residuals for the test dataset. Because positive and negative errors cancel, it measures the model's average bias on unseen data rather than the typical size of an error.
mean(t_test$residual)
## [1] 133.1496
# Analysis 
# Mean of the test residuals is 133.1496 
# On average, the model overpredicts sale prices in the test dataset by about $133. That is a very small bias relative to typical sale prices, but because opposite-signed errors cancel, it is not the typical size of an individual miss.
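
Since the signed mean understates how far off a typical prediction is, a quick supplementary check (my addition, not in the original analysis) is the mean absolute error:

# Mean absolute error: the typical size of a prediction miss, ignoring sign
mean(abs(t_test$residual))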

mean(t_train$residual)
## [1] 0.000000003154171
# The mean of the training residuals is effectively zero, which ordinary least squares guarantees by construction for a model with an intercept
# These lines calculate the Root Mean Squared Error (RMSE) for the training and test data sets
# RMSE measures the typical magnitude of a prediction error

rmse_test <- sqrt(mean(t_test$residual ^ 2))
rmse_train <- sqrt(mean(t_train$residual ^ 2))
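
As a small optional refactor (my addition, not the original code), the same computation can be wrapped in a reusable helper and applied to both sets:

# Helper: root mean squared error between actual and predicted values
rmse <- function(actual, predicted) {
  sqrt(mean((predicted - actual) ^ 2))
}

# Same values as rmse_test and rmse_train above
rmse(t_test$sale_price, t_test$predicted)
rmse(t_train$sale_price, t_train$predicted)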

# These lines generate a scatter plot comparing the predicted sale prices against the actual values for both the training and test data sets.
# The geom_abline represents perfect predictions. Blue points represent test data, and green points represent training data.

ggplot() +
  geom_abline(intercept = 0, slope = 1, color = 'black') +
  geom_point(data = t_test, 
             mapping = aes(x = predicted, y = sale_price),
             color = 'blue') +
  geom_point(data = t_train, 
             mapping = aes(x = predicted, y = sale_price),
             color = 'green', alpha = 0.1)

# This graph shows that the test and train points are near the black diagonal line
# The model's predictions closely match the actual sale prices for both the training and test data sets

Step 4: R-squared

# Calculated R-squared for train
# Took the `sum` of the squared residuals
residual_sum_of_squares <- sum(t_train$residual ^ 2)

# Found the difference between the mean sale price and each observed value
total_variation_in_model <- mean(t_train$sale_price) - t_train$sale_price

# Took the sum of the squared variation
total_sum_of_squares <- sum(total_variation_in_model^ 2)

# Calculated R^2 as 1 minus the residual sum of squares divided by the total sum of squares
r2 <- 1 - (residual_sum_of_squares / total_sum_of_squares)

print(c('r-squared for train', r2))
## [1] "r-squared for train" "0.793824349783499"
# Calculated R-squared for test
# Took the `sum` of the squared residuals
residual_sum_of_squares2 <- sum(t_test$residual^ 2)

# Found the difference between the mean sale price and each observed value
total_variation_in_model2 <- mean(t_test$sale_price) - t_test$sale_price

# Took the sum of the squared variation
total_sum_of_squares2 <- sum(total_variation_in_model2^ 2)

# Calculated R^2
r22 <- 1 - (residual_sum_of_squares2 / total_sum_of_squares2)

print(c('r-squared for test', r22))
## [1] "r-squared for test" "0.812278279784035"
# Analysis 
# 79.38% of the variance in sale prices in the training data is explained by the independent variables in my model
# 81.23% of the variance in sale prices in the test data is explained by the model
# The test R-squared is slightly higher than the train R-squared, suggesting the model generalizes well; the small gap most likely reflects the particular train/test split rather than genuinely better performance on unseen data