# Load libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.2
## Warning: package 'purrr' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the dataset
city_housing <- read.csv("E:\\SLTC\\ICE\\3rd Year\\6th Sem\\Data Analytics\\Assignments\\CityHousing.csv")
# Display basic structure and summary
str(city_housing) # Provides a concise summary of the structure
## 'data.frame': 500 obs. of 5 variables:
## $ HousePrice : num 872756 389351 536830 428950 475166 ...
## $ SquareFootage : int 3974 1660 2094 1930 1895 3892 2438 2969 1266 2038 ...
## $ Bedrooms : int 1 5 5 1 5 5 5 3 4 2 ...
## $ DistanceToCityCenter: num 4.83 14.01 2.67 3.64 1.05 ...
## $ NeighborhoodIncome : int 67082 69241 84569 51745 66029 53025 54288 46540 108878 81846 ...
summary(city_housing) # Provide the summery
## HousePrice SquareFootage Bedrooms DistanceToCityCenter
## Min. :182493 Min. : 801 Min. :1.000 Min. : 1.000
## 1st Qu.:421807 1st Qu.:1658 1st Qu.:2.000 1st Qu.: 5.992
## Median :576614 Median :2440 Median :3.000 Median :10.955
## Mean :577984 Mean :2435 Mean :3.036 Mean :10.737
## 3rd Qu.:727444 3rd Qu.:3232 3rd Qu.:4.000 3rd Qu.:15.568
## Max. :949989 Max. :3991 Max. :5.000 Max. :19.990
## NeighborhoodIncome
## Min. : 40281
## 1st Qu.: 62440
## Median : 78693
## Mean : 80134
## 3rd Qu.:100373
## Max. :119973
head(city_housing) # Provide the first few rows of the data set
## HousePrice SquareFootage Bedrooms DistanceToCityCenter NeighborhoodIncome
## 1 872756.1 3974 1 4.83 67082
## 2 389350.6 1660 5 14.01 69241
## 3 536829.7 2094 5 2.67 84569
## 4 428949.8 1930 1 3.64 51745
## 5 475165.5 1895 5 1.05 66029
## 6 926564.5 3892 5 3.22 53025
# Histogram
num_vars <- city_housing %>% select_if(is.numeric)
num_vars %>%
gather() %>%
ggplot(aes(value)) + # took this line from chatgpt
geom_histogram(bins = 15, fill = "red", color = "black") +
facet_wrap(~key, scales = "free", ncol = 2) +
labs(title = "Distributions of Variables")
# Check for missing values
missing_values <- colSums(is.na(city_housing))
missing_values
## HousePrice SquareFootage Bedrooms
## 0 0 0
## DistanceToCityCenter NeighborhoodIncome
## 0 0
# Visualize missing data (if needed)
library(Amelia) # Install using install.packages("Amelia")
## Warning: package 'Amelia' was built under R version 4.4.2
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.3, built: 2024-11-07)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(city_housing, main = "Missing Values Map", col = c("yellow", "black"), legend = FALSE)
As we can see here there is no any missing values in each column
# Boxplots for outlier detection
num_vars %>%
gather() %>%
ggplot(aes(x = key, y = value)) +
geom_boxplot(fill = "red", color = "black") +
coord_flip() +
labs(title = "Outlier Detection")
Manage the outliers using the Interquartile Range (IQR) method
# Identify potential outliers using IQR
#took this part from google
handle_outliers <- function(data) {
Q1 <- quantile(data, 0.25, na.rm = TRUE)
Q3 <- quantile(data, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
data[data < lower_bound | data > upper_bound]
}
outliers <- lapply(num_vars, handle_outliers)
outliers
## $HousePrice
## numeric(0)
##
## $SquareFootage
## integer(0)
##
## $Bedrooms
## integer(0)
##
## $DistanceToCityCenter
## numeric(0)
##
## $NeighborhoodIncome
## integer(0)
# Calculate correlations
cor_matrix <- cor(city_housing %>% select_if(is.numeric), use = "complete.obs")
# Convert the correlation matrix to a tidy data frame
cor_df <- as.data.frame(as.table(cor_matrix)) # Converts matrix to a data frame
colnames(cor_df) <- c("Var1", "Var2", "value") # Rename columns
# Visualize with a heatmap
ggplot(cor_df, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
labs(title = "Correlation Heatmap with Color Squares", fill = "Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
SquareFootage is the strongest predictor of HousePrice. Larger houses
are more expensive, which is intuitive.
Other factors like the number of bedrooms, distance to the city center, and neighborhood income have little to no direct relationship with house prices in this dataset.
Build the Regression Model
# Load required library
library(tidyverse)
# Fit a linear regression model
lm_model <- lm(HousePrice ~ ., data = city_housing)
# Display the summary of the model
summary(lm_model)
##
## Call:
## lm(formula = HousePrice ~ ., data = city_housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -89174 -20301 -650 19729 93314
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.911e+04 7.216e+03 6.805 2.92e-11 ***
## SquareFootage 1.995e+02 1.470e+00 135.661 < 2e-16 ***
## Bedrooms 1.094e+04 9.214e+02 11.877 < 2e-16 ***
## DistanceToCityCenter -1.201e+03 2.434e+02 -4.935 1.10e-06 ***
## NeighborhoodIncome 2.838e-01 5.862e-02 4.842 1.72e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29930 on 495 degrees of freedom
## Multiple R-squared: 0.9739, Adjusted R-squared: 0.9737
## F-statistic: 4616 on 4 and 495 DF, p-value: < 2.2e-16
# Scatter plot with regression line (for one predictor, e.g., SquareFootage)
ggplot(city_housing, aes(x = SquareFootage, y = HousePrice)) +
geom_point(color = "blue", alpha = 0.6) + # Scatter points
geom_smooth(method = "lm", se = TRUE, color = "red") + # Regression line
labs(title = "House Prices vs Square Footage",
x = "Square Footage",
y = "House Price") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
The inclusion/exclusion of variables in the model using appropriate
statistical metrics.
SquareFootage
P-value = < 2e-16
Variance Inflation Factor = Low Decision = Include
Justification = Highly significant and intuitive predictor of house
price.
Bedrooms
P-value = < 2e-16
Variance Inflation Factor = Moderate
Justification = Include Statistically significant; relevant for pricing
larger homes.
DistanceToCityCenter
P-value = 1.10e-06
Variance Inflation Factor = Low Justification = Include Significant and
explains location-based price variations.
NeighborhoodIncome
P-value = 1.72e-06
Variance Inflation Factor = Low Justification = Include Statistically
significant; indicates socio-economic factors.
# R-squared
rsq <- summary(lm_model)$r.squared
# Adjusted R-squared
adj_rsq <- summary(lm_model)$adj.r.squared
# F-statistic and its p-value
f_statistic <- summary(lm_model)$fstatistic[1]
f_p_value <- pf(f_statistic, df1 = summary(lm_model)$fstatistic[2], df2 = summary(lm_model)$fstatistic[3], lower.tail = FALSE)
# Coefficients
coefficients <- summary(lm_model)$coefficients
# Print the key metrics
cat("R-squared:", rsq, "\n")
## R-squared: 0.9738901
cat("Adjusted R-squared:", adj_rsq, "\n")
## Adjusted R-squared: 0.9736791
cat("F-statistic:", f_statistic, "\n")
## F-statistic: 4615.838
cat("F-statistic p-value:", f_p_value, "\n")
## F-statistic p-value: 0
cat("Coefficients:\n")
## Coefficients:
print(coefficients)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49108.2020882 7215.9663508 6.805492 2.919776e-11
## SquareFootage 199.4835183 1.4704513 135.661426 0.000000e+00
## Bedrooms 10944.2143146 921.4483531 11.877187 8.436721e-29
## DistanceToCityCenter -1200.8646492 243.3608839 -4.934502 1.099539e-06
## NeighborhoodIncome 0.2838393 0.0586161 4.842344 1.718264e-06
(Intercept) - 49,110
Base price of a house when all predictors are zero. It represents the
starting point of the house price when all other features are zero. This
value is often not meaningful in a real-world context since houses with
zero square footage or bedrooms do not exist.
SquareFootage - 199.5
For each additional square foot of house size, the house price increases
by 199.5 units. This is the strongest predictor, meaning larger houses
tend to be more expensive.
Bedrooms - 10,940
For each additional bedroom, the house price increases by 10,940 units.
This suggests that more bedrooms generally make a house more expensive,
though the effect is less strong than SquareFootage.
DistanceToCityCenter - 1,201
For each unit increase in the distance from the city center, the house
price decreases by 1,201 units. This makes sense, as houses further from
the city center tend to be less expensive.
NeighborhoodIncome - 0.2838 For each additional unit increase in the neighborhood income, the house price increases by 0.2838 units. This suggests that houses in wealthier neighborhoods tend to have higher prices.
# Install and load the car package if not already installed
#took this from ChatGPT
library(car)
## Warning: package 'car' was built under R version 4.4.2
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# Calculate the VIF for each predictor in the model
vif_values <- vif(lm_model)
# Print the VIF values
print(vif_values)
## SquareFootage Bedrooms DistanceToCityCenter
## 1.004008 1.002132 1.000719
## NeighborhoodIncome
## 1.003306
There is no clear indication of problematic multicollinearity in this model
# Residual plot
plot(lm_model$fitted.values, residuals(lm_model),
xlab = "Fitted Values", ylab = "Residuals",
main = "Residuals vs Fitted")
abline(h = 0, col = "red", lwd = 2)
# Q-Q plot
qqnorm(residuals(lm_model))
qqline(residuals(lm_model), col = "red", lwd = 2)
# Residuals vs Fitted (again)3
#took from google
plot(lm_model$fitted.values, residuals(lm_model),
xlab = "Fitted Values", ylab = "Residuals",
main = "Residuals vs Fitted")
abline(h = 0, col = "red", lwd = 2)
# Breusch-Pagan test
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.2
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(lm_model)
##
## studentized Breusch-Pagan test
##
## data: lm_model
## BP = 4.3667, df = 4, p-value = 0.3586
# Durbin-Watson test
library(lmtest)
dwtest(lm_model)
##
## Durbin-Watson test
##
## data: lm_model
## DW = 1.9007, p-value = 0.1338
## alternative hypothesis: true autocorrelation is greater than 0