## 1
# Load necessary libraries
library(dplyr) # For data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # For visualization
# Read the housing data from disk
housing_data <- read.csv(file = "pacdev_data.csv")
# Simple linear regression: price explained by raw (uncentered) sqft
model <- lm(formula = price ~ sqft, data = housing_data)
# Print the coefficient table and overall fit statistics
summary(object = model)
##
## Call:
## lm(formula = price ~ sqft, data = housing_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -622948 -151283 -1650 138951 804553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40623.019 15862.454 2.561 0.0105 *
## sqft 269.345 7.742 34.791 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 210300 on 4989 degrees of freedom
## Multiple R-squared: 0.1953, Adjusted R-squared: 0.1951
## F-statistic: 1210 on 1 and 4989 DF, p-value: < 2.2e-16
## 2
# Load necessary libraries
library(dplyr) # For data manipulation
library(ggplot2) # For visualization
# Read the housing data from disk
housing_data <- read.csv(file = "pacdev_data.csv")
# Add sqft expressed as a deviation from its sample mean
housing_data[["centered_sqft"]] <-
  housing_data[["sqft"]] - mean(housing_data[["sqft"]])
# Regress price on the mean-centered sqft column
model_centered <- lm(formula = price ~ centered_sqft, data = housing_data)
# Print the coefficient table and overall fit statistics
summary(object = model_centered)
##
## Call:
## lm(formula = price ~ centered_sqft, data = housing_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -622948 -151283 -1650 138951 804553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.827e+05 2.976e+03 195.77 <2e-16 ***
## centered_sqft 2.693e+02 7.742e+00 34.79 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 210300 on 4989 degrees of freedom
## Multiple R-squared: 0.1953, Adjusted R-squared: 0.1951
## F-statistic: 1210 on 1 and 4989 DF, p-value: < 2.2e-16
# Load necessary libraries
library(dplyr) # For data manipulation
# Load the dataset
housing_data <- read.csv("pacdev_data.csv")

# Center the numeric predictors on their means. Plain subtraction keeps each
# column an ordinary numeric vector, whereas scale() returns a one-column
# matrix that would be stored as a matrix column inside the data frame.
# na.rm = TRUE mirrors how scale() computes its centering value.
housing_data$centered_bed <-
  housing_data$bed - mean(housing_data$bed, na.rm = TRUE)
housing_data$centered_bath <-
  housing_data$bath - mean(housing_data$bath, na.rm = TRUE)
housing_data$centered_sqft <-
  housing_data$sqft - mean(housing_data$sqft, na.rm = TRUE)

# Build 0/1 indicator columns for 'garage'. The indicators are named before
# they are attached, so no unrelated column can be renamed by position.
# Assumes 'garage' has exactly two levels that sort with the "no garage"
# level first -- TODO confirm against the data.
garage_dummies <- model.matrix(~ garage - 1, data = housing_data)
stopifnot(ncol(garage_dummies) == 2L)
colnames(garage_dummies) <- c("no_garage", "yes_garage")
housing_data <- cbind(housing_data, garage_dummies)

# Fit the multiple regression with the centered predictors, the garage
# indicator, pool and city.
# yes_garage is left out of the formula: together with the intercept,
# no_garage + yes_garage always equals 1, so the two indicators are perfectly
# collinear and lm() can only report NA for one of them
# ("1 not defined because of singularities"). The yes_garage column itself
# stays in housing_data.
model_multiple <- lm(
  price ~ centered_sqft + centered_bed + centered_bath +
    no_garage + pool + city,
  data = housing_data
)

# Display coefficients
summary(model_multiple)
##
## Call:
## lm(formula = price ~ centered_sqft + centered_bed + centered_bath +
## no_garage + yes_garage + pool + city, data = as.data.frame(housing_data))
##
## Residuals:
## Min 1Q Median 3Q Max
## -539286 -137407 -3532 124838 852187
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 507404.206 4502.633 112.691 <2e-16 ***
## centered_sqft 271.561 7.515 36.138 <2e-16 ***
## centered_bed 41.553 8420.185 0.005 0.9961
## centered_bath -3092.909 9439.900 -0.328 0.7432
## no_garage 14195.911 6120.799 2.319 0.0204 *
## yes_garage NA NA NA NA
## poolyes 10124.630 6760.090 1.498 0.1343
## citySanta Monica 190239.704 6757.751 28.151 <2e-16 ***
## cityWestwood 88020.719 6794.984 12.954 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 195000 on 4983 degrees of freedom
## Multiple R-squared: 0.3085, Adjusted R-squared: 0.3075
## F-statistic: 317.6 on 7 and 4983 DF, p-value: < 2.2e-16
# Extend the multiple regression with a centered_sqft x city interaction so
# that the price-per-sqft slope is allowed to differ between cities.
# yes_garage is left out of the formula: together with the intercept,
# no_garage + yes_garage always equals 1, so the two indicators are perfectly
# collinear and lm() can only report NA for one of them
# ("1 not defined because of singularities").
# housing_data is already a data frame, so it is passed to lm() directly.
model_interaction <- lm(
  price ~ centered_sqft * city + centered_bed + centered_bath +
    no_garage + pool,
  data = housing_data
)

# Display coefficients for the interaction model
summary(model_interaction)
##
## Call:
## lm(formula = price ~ centered_sqft * city + centered_bed + centered_bath +
## no_garage + yes_garage + pool, data = as.data.frame(housing_data))
##
## Residuals:
## Min 1Q Median 3Q Max
## -536726 -138828 -2505 124312 868332
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 507303.45 4491.69 112.943 < 2e-16 ***
## centered_sqft 240.63 10.14 23.726 < 2e-16 ***
## citySanta Monica 189904.29 6741.32 28.170 < 2e-16 ***
## cityWestwood 88037.11 6778.98 12.987 < 2e-16 ***
## centered_bed 235.26 8403.39 0.028 0.9777
## centered_bath -3099.26 9419.64 -0.329 0.7422
## no_garage 14205.98 6108.49 2.326 0.0201 *
## yes_garage NA NA NA NA
## poolyes 10817.17 6744.71 1.604 0.1088
## centered_sqft:citySanta Monica 90.09 17.49 5.152 2.68e-07 ***
## centered_sqft:cityWestwood 37.95 18.05 2.103 0.0356 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 194500 on 4981 degrees of freedom
## Multiple R-squared: 0.3122, Adjusted R-squared: 0.311
## F-statistic: 251.2 on 9 and 4981 DF, p-value: < 2.2e-16
# Scatter plot of price against centered sqft, coloured by city, with one
# fitted straight line per city to show how the slope differs between cities
library(ggplot2)
ggplot(
  data = housing_data,
  mapping = aes(x = centered_sqft, y = price, colour = city)
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(
    title = "Interaction between sqft and city",
    x = "Centered Sqft",
    y = "Price"
  )
# Diagnostic plot 1 of the interaction model: residuals against fitted values
plot(x = model_interaction, which = 1)
## Based on the residuals-versus-fitted plot above, the model appears to fit
## reasonably well. The residuals show no visible pattern against the fitted
## values, which implies that the two are unrelated and suggests that the
## model is capturing the systematic variation in the dataset.
## Based on the regression model, 'sqft' appears to be the primary factor affecting housing prices. It is not the only impactful variable: most of the other predictors also have positive coefficients (that is, an increase in the predictor is associated with a higher price), although not all of them are statistically significant and the estimate for 'bath' is slightly negative. The model also shows that the effect of sqft on housing prices differs between cities: the sqft-by-city interaction terms are significant, so the relationship between size and price varies across residential areas, and the model captures this variation.
## From the plot of price against sqft by city, Santa Monica appears to have the strongest positive relationship between sqft and housing prices. This is shown by its steeper fitted line, which also sits noticeably higher at the intercept than the fitted lines for the other cities.
## The interaction model from Q4 also shows that these predictors have a statistically significant effect on housing prices, and that this effect does not appear to be distorted by a poorly specified model. The residual plot, in which the residuals are scattered randomly against the fitted values, supports this as well.
## I would recommend that Andrew increase the price per sqft in Santa Monica, because that city shows the strongest positive relationship between sqft and housing remodel prices. Raising prices there should maximize profits for the company, since this client base is likely more willing to pay higher prices, being exposed to them more often. If Andrew were instead to raise the price per sqft in Westwood, it could reduce the number of customers who go through with a remodel, or push them to competitors. To stay competitive, I would therefore recommend that he increase the per-sqft price in Santa Monica.