# Loading necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
# Load the data set.
# The CSV location can be overridden with the NY_HOUSE_CSV environment variable
# so the script is not tied to a single machine; when the variable is unset the
# original path is used unchanged.
ny_house_csv <- Sys.getenv(
  "NY_HOUSE_CSV",
  unset = "C:\\Users\\velag\\Downloads\\NY-House-Dataset.csv"
)
NY_House_Dataset <- read.csv(ny_house_csv)
# The response variable
response_variable <- "PRICE" # property price
# The explanatory variable
explanatory_variable <- "BEDS" # number of bedrooms, which might influence property price
# Fit PRICE ~ BEDS with aov(); the formula is assembled from the two names above.
# NOTE(review): the recorded summary below reports 1 Df for BEDS, so BEDS is
# being treated as a numeric predictor here (a linear trend), not as a set of
# categories. Wrap it in factor() if a between-group comparison is intended.
anova_formula <- as.formula(paste(response_variable, "~", explanatory_variable))
anova_result <- aov(anova_formula, data = NY_House_Dataset)
# Summarize ANOVA results
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## BEDS 1 1.285e+16 1.285e+16 13.11 0.000297 ***
## Residuals 4799 4.706e+18 9.807e+14
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Collapse BEDS into three bands, but only when it has more than 10 distinct values
bed_values <- NY_House_Dataset$BEDS
if (length(unique(bed_values)) > 10) {
  NY_House_Dataset <- mutate(
    NY_House_Dataset,
    BEDS = if_else(
      BEDS %in% 1:3,
      "1-3",
      # Every value outside 1-6 falls through to the "7+" label
      if_else(BEDS %in% 4:6, "4-6", "7+")
    )
  )
}
# Refit the same PRICE ~ BEDS model on the (possibly banded) data
consolidated_formula <- as.formula(
  paste(response_variable, explanatory_variable, sep = " ~ ")
)
anova_result_consolidated <- aov(consolidated_formula, data = NY_House_Dataset)
# Print the ANOVA table for the refit
summary(anova_result_consolidated)
## Df Sum Sq Mean Sq F value Pr(>F)
## BEDS 2 3.387e+16 1.694e+16 17.34 3.12e-08 ***
## Residuals 4798 4.685e+18 9.765e+14
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Interpret ANOVA results
# The p-value associated with the explanatory variable indicates whether there is a significant difference
# in the response variable (PRICE) among different levels of the categorical variable (BEDS).
# If the p-value is less than the chosen significance level (e.g., 0.05), we reject the null hypothesis
# and we can conclude that there is a significant difference in property prices based on the number of bedrooms.
# A continuous variable that might influence the response variable (PRICE).
# NOTE(review): the lm() call below writes the formula out literally, so this
# name is not referenced by the model fit in this section.
continuous_variable <- "PROPERTYSQFT" # property square footage might influence property price
# Fit a simple linear regression of PRICE on PROPERTYSQFT (single predictor).
# linear_model is reused by the diagnostic plots further down.
linear_model <- lm(PRICE ~ PROPERTYSQFT, data = NY_House_Dataset)
# Print the fit summary: residual quantiles, coefficient table, R-squared and F-test
summary(linear_model)
##
## Call:
## lm(formula = PRICE ~ PROPERTYSQFT, data = NY_House_Dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -74220190 -1692697 -706940 142847 2133694874
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -837809.4 610850.8 -1.372 0.17
## PROPERTYSQFT 1462.7 189.2 7.729 1.31e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31170000 on 4799 degrees of freedom
## Multiple R-squared: 0.0123, Adjusted R-squared: 0.01209
## F-statistic: 59.74 on 1 and 4799 DF, p-value: 1.307e-14
# Residuals vs Fitted Plot
# plot() takes x first and y second: fitted values go on the x-axis and
# residuals on the y-axis so the points agree with the axis labels and title.
plot(fitted(linear_model), residuals(linear_model),
     xlab = "Fitted values", ylab = "Residuals",
     main = "Residuals vs Fitted")

# Normal Q-Q plot of the model residuals with the reference line drawn on top.
# local() keeps the temporary vector out of the global environment.
local({
  model_residuals <- residuals(linear_model)
  qqnorm(model_residuals)
  qqline(model_residuals)
})

# Scale-Location plot: sqrt(|standardized residual|) on the y-axis against
# the fitted values on the x-axis
plot(
  x = fitted(linear_model),
  y = sqrt(abs(rstandard(linear_model))),
  main = "Scale-Location Plot",
  xlab = "Fitted values",
  ylab = "Square root of standardized residuals"
)

# Residuals vs Leverage: hat values (leverage) on x, raw residuals on y
plot(
  x = hatvalues(linear_model),
  y = residuals(linear_model),
  main = "Residuals vs Leverage",
  xlab = "Leverage",
  ylab = "Residuals"
)

# Cook's distance per observation, with a dashed red reference line at
# four times the mean distance. The distances are computed once and reused;
# local() keeps the temporary vector out of the global environment.
local({
  cooks_d <- cooks.distance(linear_model)
  plot(
    cooks_d,
    pch = 20,
    cex = 1,
    main = "Cook's Distance Plot",
    xlab = "Observation",
    ylab = "Cook's Distance"
  )
  abline(h = 4 * mean(cooks_d), lty = 2, col = "red")
})

# Interpret coefficients
# The coefficient for PROPERTYSQFT is the estimated change in the response variable (PRICE)
# for a one-unit (one square foot) increase in PROPERTYSQFT, the only predictor in this model.
# For example, if the coefficient is 100, every additional square foot of property space
# is associated with a $100 higher predicted price. In the summary above the estimate is about 1462.7.
# Further investigation
# Further investigation could involve exploring additional variables that might influence property prices,
# such as location, amenities, or neighborhood characteristics.
# Additionally, assessing the presence of multicollinearity among predictors and checking model assumptions
# (e.g., linearity, normality of residuals) could enhance the robustness of the linear regression model.