1. Exploratory Data Analysis (EDA)
# Load libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.2
## Warning: package 'purrr' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the dataset
city_housing <- read.csv("E:\\SLTC\\ICE\\3rd Year\\6th Sem\\Data Analytics\\Assignments\\CityHousing.csv")

# Display basic structure and summary

str(city_housing) # Provides a concise summary of the structure
## 'data.frame':    500 obs. of  5 variables:
##  $ HousePrice          : num  872756 389351 536830 428950 475166 ...
##  $ SquareFootage       : int  3974 1660 2094 1930 1895 3892 2438 2969 1266 2038 ...
##  $ Bedrooms            : int  1 5 5 1 5 5 5 3 4 2 ...
##  $ DistanceToCityCenter: num  4.83 14.01 2.67 3.64 1.05 ...
##  $ NeighborhoodIncome  : int  67082 69241 84569 51745 66029 53025 54288 46540 108878 81846 ...
summary(city_housing) # Provide the summery
##    HousePrice     SquareFootage     Bedrooms     DistanceToCityCenter
##  Min.   :182493   Min.   : 801   Min.   :1.000   Min.   : 1.000      
##  1st Qu.:421807   1st Qu.:1658   1st Qu.:2.000   1st Qu.: 5.992      
##  Median :576614   Median :2440   Median :3.000   Median :10.955      
##  Mean   :577984   Mean   :2435   Mean   :3.036   Mean   :10.737      
##  3rd Qu.:727444   3rd Qu.:3232   3rd Qu.:4.000   3rd Qu.:15.568      
##  Max.   :949989   Max.   :3991   Max.   :5.000   Max.   :19.990      
##  NeighborhoodIncome
##  Min.   : 40281    
##  1st Qu.: 62440    
##  Median : 78693    
##  Mean   : 80134    
##  3rd Qu.:100373    
##  Max.   :119973
head(city_housing) # Provide the first few rows of the data set
##   HousePrice SquareFootage Bedrooms DistanceToCityCenter NeighborhoodIncome
## 1   872756.1          3974        1                 4.83              67082
## 2   389350.6          1660        5                14.01              69241
## 3   536829.7          2094        5                 2.67              84569
## 4   428949.8          1930        1                 3.64              51745
## 5   475165.5          1895        5                 1.05              66029
## 6   926564.5          3892        5                 3.22              53025
# Histogram
num_vars <- city_housing %>% select_if(is.numeric)
num_vars %>% 
  gather() %>% 
  ggplot(aes(value)) + # took this line from chatgpt
  geom_histogram(bins = 15, fill = "red", color = "black") +
  facet_wrap(~key, scales = "free", ncol = 2) +
  labs(title = "Distributions of Variables")

# Check for missing values
missing_values <- colSums(is.na(city_housing))
missing_values
##           HousePrice        SquareFootage             Bedrooms 
##                    0                    0                    0 
## DistanceToCityCenter   NeighborhoodIncome 
##                    0                    0
# Visualize missing data (if needed)
library(Amelia) # Install using install.packages("Amelia")
## Warning: package 'Amelia' was built under R version 4.4.2
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.3, built: 2024-11-07)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(city_housing, main = "Missing Values Map", col = c("yellow", "black"), legend = FALSE)

As we can see here there is no any missing values in each column

# Boxplots for outlier detection
num_vars %>% 
  gather() %>% 
  ggplot(aes(x = key, y = value)) +
  geom_boxplot(fill = "red", color = "black") +
  coord_flip() +
  labs(title = "Outlier Detection")

Manage the outliers using the Interquartile Range (IQR) method

# Identify potential outliers using IQR
#took this part from google
handle_outliers <- function(data) {
  Q1 <- quantile(data, 0.25, na.rm = TRUE)
  Q3 <- quantile(data, 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  data[data < lower_bound | data > upper_bound]
}

outliers <- lapply(num_vars, handle_outliers)
outliers
## $HousePrice
## numeric(0)
## 
## $SquareFootage
## integer(0)
## 
## $Bedrooms
## integer(0)
## 
## $DistanceToCityCenter
## numeric(0)
## 
## $NeighborhoodIncome
## integer(0)
# Calculate correlations
cor_matrix <- cor(city_housing %>% select_if(is.numeric), use = "complete.obs")

# Convert the correlation matrix to a tidy data frame
cor_df <- as.data.frame(as.table(cor_matrix)) # Converts matrix to a data frame
colnames(cor_df) <- c("Var1", "Var2", "value") # Rename columns

# Visualize with a heatmap
ggplot(cor_df, aes(x = Var1, y = Var2, fill = value)) + 
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
  labs(title = "Correlation Heatmap with Color Squares", fill = "Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

SquareFootage is the strongest predictor of HousePrice. Larger houses are more expensive, which is intuitive.

Other factors like the number of bedrooms, distance to the city center, and neighborhood income have little to no direct relationship with house prices in this dataset.

Build the Regression Model

# Load required library
library(tidyverse)

# Fit a linear regression model
lm_model <- lm(HousePrice ~ ., data = city_housing)

# Display the summary of the model
summary(lm_model)
## 
## Call:
## lm(formula = HousePrice ~ ., data = city_housing)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -89174 -20301   -650  19729  93314 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           4.911e+04  7.216e+03   6.805 2.92e-11 ***
## SquareFootage         1.995e+02  1.470e+00 135.661  < 2e-16 ***
## Bedrooms              1.094e+04  9.214e+02  11.877  < 2e-16 ***
## DistanceToCityCenter -1.201e+03  2.434e+02  -4.935 1.10e-06 ***
## NeighborhoodIncome    2.838e-01  5.862e-02   4.842 1.72e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29930 on 495 degrees of freedom
## Multiple R-squared:  0.9739, Adjusted R-squared:  0.9737 
## F-statistic:  4616 on 4 and 495 DF,  p-value: < 2.2e-16
# Scatter plot with regression line (for one predictor, e.g., SquareFootage)
ggplot(city_housing, aes(x = SquareFootage, y = HousePrice)) +
  geom_point(color = "blue", alpha = 0.6) + # Scatter points
  geom_smooth(method = "lm", se = TRUE, color = "red") + # Regression line
  labs(title = "House Prices vs Square Footage",
       x = "Square Footage",
       y = "House Price") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

The inclusion/exclusion of variables in the model using appropriate statistical metrics.

SquareFootage
P-value = < 2e-16
Variance Inflation Factor = Low Decision = Include
Justification = Highly significant and intuitive predictor of house price.

Bedrooms
P-value = < 2e-16
Variance Inflation Factor = Moderate
Justification = Include Statistically significant; relevant for pricing larger homes.

DistanceToCityCenter
P-value = 1.10e-06
Variance Inflation Factor = Low Justification = Include Significant and explains location-based price variations.

NeighborhoodIncome
P-value = 1.72e-06
Variance Inflation Factor = Low Justification = Include Statistically significant; indicates socio-economic factors.

  1. Interpret the Results
# R-squared
rsq <- summary(lm_model)$r.squared

# Adjusted R-squared
adj_rsq <- summary(lm_model)$adj.r.squared

# F-statistic and its p-value
f_statistic <- summary(lm_model)$fstatistic[1]
f_p_value <- pf(f_statistic, df1 = summary(lm_model)$fstatistic[2], df2 = summary(lm_model)$fstatistic[3], lower.tail = FALSE)

# Coefficients
coefficients <- summary(lm_model)$coefficients

# Print the key metrics
cat("R-squared:", rsq, "\n")
## R-squared: 0.9738901
cat("Adjusted R-squared:", adj_rsq, "\n")
## Adjusted R-squared: 0.9736791
cat("F-statistic:", f_statistic, "\n")
## F-statistic: 4615.838
cat("F-statistic p-value:", f_p_value, "\n")
## F-statistic p-value: 0
cat("Coefficients:\n")
## Coefficients:
print(coefficients)
##                           Estimate   Std. Error    t value     Pr(>|t|)
## (Intercept)          49108.2020882 7215.9663508   6.805492 2.919776e-11
## SquareFootage          199.4835183    1.4704513 135.661426 0.000000e+00
## Bedrooms             10944.2143146  921.4483531  11.877187 8.436721e-29
## DistanceToCityCenter -1200.8646492  243.3608839  -4.934502 1.099539e-06
## NeighborhoodIncome       0.2838393    0.0586161   4.842344 1.718264e-06

(Intercept) - 49,110
Base price of a house when all predictors are zero. It represents the starting point of the house price when all other features are zero. This value is often not meaningful in a real-world context since houses with zero square footage or bedrooms do not exist.

SquareFootage - 199.5
For each additional square foot of house size, the house price increases by 199.5 units. This is the strongest predictor, meaning larger houses tend to be more expensive.

Bedrooms - 10,940
For each additional bedroom, the house price increases by 10,940 units. This suggests that more bedrooms generally make a house more expensive, though the effect is less strong than SquareFootage.

DistanceToCityCenter - 1,201
For each unit increase in the distance from the city center, the house price decreases by 1,201 units. This makes sense, as houses further from the city center tend to be less expensive.

NeighborhoodIncome - 0.2838 For each additional unit increase in the neighborhood income, the house price increases by 0.2838 units. This suggests that houses in wealthier neighborhoods tend to have higher prices.

# Install and load the car package if not already installed
#took this from ChatGPT
library(car)
## Warning: package 'car' was built under R version 4.4.2
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
# Calculate the VIF for each predictor in the model
vif_values <- vif(lm_model)

# Print the VIF values
print(vif_values)
##        SquareFootage             Bedrooms DistanceToCityCenter 
##             1.004008             1.002132             1.000719 
##   NeighborhoodIncome 
##             1.003306

There is no clear indication of problematic multicollinearity in this model

  1. Validate Model Assumptions
# Residual plot
plot(lm_model$fitted.values, residuals(lm_model), 
     xlab = "Fitted Values", ylab = "Residuals", 
     main = "Residuals vs Fitted")
abline(h = 0, col = "red", lwd = 2)

# Q-Q plot
qqnorm(residuals(lm_model))
qqline(residuals(lm_model), col = "red", lwd = 2)

# Residuals vs Fitted (again)3
#took from google
plot(lm_model$fitted.values, residuals(lm_model), 
     xlab = "Fitted Values", ylab = "Residuals", 
     main = "Residuals vs Fitted")
abline(h = 0, col = "red", lwd = 2)

# Breusch-Pagan test
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.2
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(lm_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm_model
## BP = 4.3667, df = 4, p-value = 0.3586
# Durbin-Watson test
library(lmtest)
dwtest(lm_model)
## 
##  Durbin-Watson test
## 
## data:  lm_model
## DW = 1.9007, p-value = 0.1338
## alternative hypothesis: true autocorrelation is greater than 0