Data Dive — Regression Modeling

# Loading necessary libraries
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(tidyr)

# Load the NY housing data.
# The CSV location can be overridden with the NY_HOUSE_CSV environment
# variable so the script also runs on machines other than the author's;
# when the variable is unset, the original path is used.
ny_house_csv <- Sys.getenv(
  "NY_HOUSE_CSV",
  unset = "C:\\Users\\velag\\Downloads\\NY-House-Dataset.csv"
)
if (!file.exists(ny_house_csv)) {
  stop("Data file not found: ", ny_house_csv, call. = FALSE)
}
NY_House_Dataset <- read.csv(ny_house_csv)

# Fail early if a column used later in the analysis is missing.
required_columns <- c("PRICE", "BEDS", "PROPERTYSQFT")
stopifnot(all(required_columns %in% names(NY_House_Dataset)))

# Response variable: property price.
response_variable <- "PRICE"

# Categorical explanatory variable: number of bedrooms.
explanatory_variable <- "BEDS"

# One-way ANOVA of PRICE across bedroom counts.
# BEDS is stored as a number, and aov() fits a numeric predictor as a single
# linear slope (1 Df) instead of comparing group means. Wrapping it in
# factor() inside the formula makes each bedroom count its own group and
# leaves the BEDS column itself untouched.
anova_formula <- as.formula(
  paste0(response_variable, " ~ factor(", explanatory_variable, ")")
)
anova_result <- aov(anova_formula, data = NY_House_Dataset)

# Summarize ANOVA results
# NOTE: the table recorded below (BEDS with Df = 1) was produced with BEDS
# treated as numeric; re-run the script to refresh it.
summary(anova_result)

##               Df    Sum Sq   Mean Sq F value   Pr(>F)    
## BEDS           1 1.285e+16 1.285e+16   13.11 0.000297 ***
## Residuals   4799 4.706e+18 9.807e+14                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Consolidate bedroom counts into three bands when there are more than 10
# distinct values, so the ANOVA compares a manageable number of groups.
# Each band is matched by an explicit range, checked from the top down.
# A catch-all branch would file missing or below-1 bedroom counts under "7+";
# here such rows match no band and become NA.
if (length(unique(NY_House_Dataset$BEDS)) > 10) {
  NY_House_Dataset <- NY_House_Dataset %>%
    mutate(BEDS = case_when(
      BEDS >= 7 ~ "7+",
      BEDS >= 4 ~ "4-6",
      BEDS >= 1 ~ "1-3"
    ))
}

# Re-fit the ANOVA on the BEDS column as it stands after the consolidation
# step, building the formula from the two variable-name strings
anova_result_consolidated <- aov(
  reformulate(explanatory_variable, response = response_variable),
  data = NY_House_Dataset
)

# Print the ANOVA table for the re-fitted model
summary(anova_result_consolidated)

##               Df    Sum Sq   Mean Sq F value   Pr(>F)    
## BEDS           2 3.387e+16 1.694e+16   17.34 3.12e-08 ***
## Residuals   4798 4.685e+18 9.765e+14                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Interpret ANOVA results
# The p-value for BEDS tests the null hypothesis that mean PRICE is the same across all
# levels of the categorical variable (BEDS).
# If the p-value is less than the chosen significance level (e.g., 0.05), we reject the null hypothesis
# and conclude that mean property price differs for at least one bedroom group.
# In the consolidated output above, F = 17.34 with p = 3.12e-08, far below 0.05, so the
# difference in mean price across the bedroom groups is statistically significant.

# Continuous predictor for the regression: property square footage.
# NOTE(review): this name is not referenced by the lm() call below, which
# spells the column out directly.
continuous_variable <- "PROPERTYSQFT"  # column name of the continuous predictor

# Fit a simple linear regression of PRICE on PROPERTYSQFT.
# The formula is written literally so that the "Call:" line printed by
# summary() shows the actual column names.
linear_model <- lm(PRICE ~ PROPERTYSQFT, data = NY_House_Dataset)

# Print the residual summary, coefficient table, R-squared and overall F-test
summary(linear_model)

## 
## Call:
## lm(formula = PRICE ~ PROPERTYSQFT, data = NY_House_Dataset)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
##  -74220190   -1692697    -706940     142847 2133694874 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -837809.4   610850.8  -1.372     0.17    
## PROPERTYSQFT    1462.7      189.2   7.729 1.31e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31170000 on 4799 degrees of freedom
## Multiple R-squared:  0.0123, Adjusted R-squared:  0.01209 
## F-statistic: 59.74 on 1 and 4799 DF,  p-value: 1.307e-14

# Residuals vs Fitted Plot
# Fitted values go on the x-axis and residuals on the y-axis so the data
# match the axis labels. A patternless band around zero supports the
# linearity and constant-variance assumptions.
plot(fitted(linear_model), residuals(linear_model),
     xlab = "Fitted values", ylab = "Residuals",
     main = "Residuals vs Fitted")
# Zero reference line to make systematic departures easier to see
abline(h = 0, lty = 2)

# Normal Q-Q plot of the model residuals with its reference line;
# the residuals are extracted once and shared by both calls
local({
  model_residuals <- residuals(linear_model)
  qqnorm(model_residuals)
  qqline(model_residuals)
})

# Scale-Location plot: square root of the absolute standardized residuals
# against the fitted values, used to judge whether residual spread is constant
plot(x = fitted(linear_model),
     y = sqrt(abs(rstandard(linear_model))),
     main = "Scale-Location Plot",
     xlab = "Fitted values",
     ylab = "Square root of standardized residuals")

# Residuals vs Leverage: raw residuals against hat values, to spot
# observations that combine high leverage with a large residual
local({
  leverage <- hatvalues(linear_model)
  raw_residuals <- residuals(linear_model)
  plot(leverage, raw_residuals,
       main = "Residuals vs Leverage",
       xlab = "Leverage",
       ylab = "Residuals")
})

# Cook's distance for each observation, with a dashed red reference line at
# four times the mean distance to flag potentially influential points
local({
  cooks_d <- cooks.distance(linear_model)
  plot(cooks_d,
       pch = 20, cex = 1,
       main = "Cook's Distance Plot",
       xlab = "Observation",
       ylab = "Cook's Distance")
  abline(h = 4 * mean(cooks_d), col = "red", lty = 2)
})

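# Interpret coefficients
# The coefficient for PROPERTYSQFT represents the estimated change in the response variable (PRICE)
# for a one-unit (one square foot) increase in property square footage. PROPERTYSQFT is the only
# predictor in this model, so no other variables are being held constant.
# In the summary above the estimate is 1462.7: each additional square foot of property space is
# associated with a price about $1,463 higher on average. The R-squared of 0.0123 shows that
# square footage alone explains very little of the variation in price.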

# Further investigation
# Further investigation could involve exploring additional variables that might influence property prices,
# such as location, amenities, or neighborhood characteristics.
# Additionally, assessing multicollinearity once more predictors are added, and checking model assumptions
# (e.g., linearity, normality of residuals), could enhance the robustness of the linear regression model.

Data Dive — Regression Modeling

Abhinandhan Velagapudi

2024-03-03