# Load the Redfin listings export and clean the column names.
# NOTE(review): clean_names() is presumably janitor::clean_names(); the
# library() calls are not visible in this part of the file.
rf <- read.csv("redfin_health_March2024.csv")
rf <- clean_names(rf)
# EDA with skimr and DataExplorer
# Keep only the numeric columns. select(where(...)) is the current
# replacement for the superseded select_if().
rf_subset <- rf %>%
  select(where(is.numeric))
# Columns 2 through 10 of the numeric subset, chosen by position.
rf_subset_house <- rf_subset[2:10]
# Complete-case copy (rows containing any NA removed); used for the
# correlation plot below.
rf_subset_house_NA <- na.omit(rf_subset_house)
plot_missing(rf_subset_house)
plot_histogram(rf_subset_house)
plot_boxplot(rf_subset_house, by = "price")
## Warning: Removed 7618 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
plot_correlation(rf_subset_house_NA)
plot_density(rf_subset_house)
# Housing variables used in the correlation and regression analysis.
rf_house <- rf |> dplyr::select(price, beds, baths, lot_size, x_square_feet)

#' Drop rows whose value in one column lies outside the IQR fences
#'
#' The fences are Q1 - k * IQR and Q3 + k * IQR, computed from the non-missing
#' values of `column`. Both fences are truncated toward zero before comparing.
#' Rows where `column` is NA are dropped as well.
#'
#' @param data A data frame.
#' @param column Name of a numeric column in `data`, as a string.
#' @param k Fence multiplier applied to the interquartile range.
#' @return `data` restricted to the rows inside the fences, with row names
#'   reset.
drop_iqr_outliers <- function(data, column, k = 1.5) {
  values <- data[[column]]
  quartiles <- stats::quantile(
    values,
    probs = c(0.25, 0.75),
    na.rm = TRUE,
    names = FALSE
  )
  spread <- quartiles[2] - quartiles[1]
  # trunc() truncates toward zero exactly like as.integer(), but stays a
  # double, so a fence above .Machine$integer.max cannot turn into NA and
  # silently remove every row.
  # NOTE(review): truncating the upper fence can exclude fractional values
  # that sit between trunc(fence) and the fence itself. Kept as-is so the
  # results recorded further down stay reproducible; confirm it is intended.
  lower <- trunc(quartiles[1] - k * spread)
  upper <- trunc(quartiles[2] + k * spread)
  keep <- !is.na(values) & values >= lower & values <= upper
  kept <- data[keep, , drop = FALSE]
  rownames(kept) <- NULL
  kept
}

# Remove outliers one variable at a time. Each step recomputes the quartiles
# on the rows that survived the previous step, so the order matters.
outlier_columns <- c("price", "x_square_feet", "baths", "beds", "lot_size")
rf_house <- Reduce(drop_iqr_outliers, outlier_columns, rf_house)
Below, a correlation matrix can be viewed for the selected numerical variables related to housing. I also generated a correlation table to view the results in another format. We can see that the highest correlations with price are R values of 0.548 for baths and 0.661 for x_square_feet. I selected these variables for the later regression models, as they appear to have the largest influence on the price a home will sell for. Something I found surprising is that the number of beds in a house wasn’t more highly correlated with the house’s sale price; it had a correlation of 0.187. This was also the case for lot size, with an R value of 0.237. This can make sense because homes in more urban areas tend to sell for a lot more than homes in rural areas with larger lot sizes. In this portion I aimed to select variables that would have the greatest impact on housing prices and avoided including ones with very minimal correlation.
# Correlation heatmap of the cleaned housing variables.
plot_correlation(rf_house)
# The same pairwise Pearson correlations as a numeric matrix, then displayed.
correlation_table <- cor(x = rf_house, method = "pearson")
correlation_table
## price beds baths lot_size x_square_feet
## price 1.0000000 0.1872136 0.54786430 0.2372921 0.66096106
## beds 0.1872136 1.0000000 0.50967037 -0.0784709 -0.24563931
## baths 0.5478643 0.5096704 1.00000000 0.1238801 0.07168746
## lot_size 0.2372921 -0.0784709 0.12388006 1.0000000 0.15664436
## x_square_feet 0.6609611 -0.2456393 0.07168746 0.1566444 1.00000000
Below I generated a linear regression model showing the relationship between ‘x_square_feet’ and ‘price’. The dependent variable, the one we’re observing the effects on, is price, and the independent variable is square feet. The line of best fit is shown across the data points, and its positive slope reflects the positive correlation: as the number of square feet increases, so does the price. Following the cleaning and sub-setting of the data, we were left with 1132 rows of observations and looked at 5 variables for their level of correlation. From the parameter coefficients we obtain the linear regression equation y’ = 1412x + 102169.
# Simple linear regression with price as the response and x_square_feet as
# the single predictor, fitted on the outlier-filtered data.
price_x_sq_ft_lm <- lm(formula = price ~ x_square_feet, data = rf_house)
# Show the call and the fitted coefficients.
price_x_sq_ft_lm
##
## Call:
## lm(formula = price ~ x_square_feet, data = rf_house)
##
## Coefficients:
## (Intercept) x_square_feet
## 102169 1412
# Full summary: residuals, coefficient tests, R-squared and F-statistic.
summary(object = price_x_sq_ft_lm)
##
## Call:
## lm(formula = price ~ x_square_feet, data = rf_house)
##
## Residuals:
## Min 1Q Median 3Q Max
## -692136 -123900 -39307 93452 794904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 102169.10 13055.07 7.826 1.15e-14 ***
## x_square_feet 1411.71 47.68 29.608 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 204600 on 1130 degrees of freedom
## Multiple R-squared: 0.4369, Adjusted R-squared: 0.4364
## F-statistic: 876.6 on 1 and 1130 DF, p-value: < 2.2e-16
# Scatter plot of price against square feet with the fitted linear trend
# drawn by geom_smooth(method = "lm"). The title typo "Regession" is fixed.
ggplot(rf_house, aes(x = x_square_feet, y = price)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Linear Regression Price by (x) Square Feet",
    x = "Square Feet",
    y = "Price"
  ) +
  # Comma-separated price labels instead of scientific notation.
  scale_y_continuous(labels = scales::comma)
## `geom_smooth()` using formula = 'y ~ x'
# Rows and columns remaining after outlier removal.
dim(rf_house)
## [1] 1132 5
In this regression model I wanted to see how the number of baths influenced the price of a home for sale. The dependent variable was again price, while the independent variable was the number of baths in a home. We can again see a positive correlation through the line of best fit in the data. This indicates that as the number of baths in a home increases, so does the price of the house. From the parameter coefficients we obtain the linear regression equation y’ = 197388x + 52497.
# Simple linear regression with price as the response and baths as the
# single predictor, fitted on the outlier-filtered data.
price_baths_lm <- lm(formula = price ~ baths, data = rf_house)
# Show the call and the fitted coefficients.
price_baths_lm
##
## Call:
## lm(formula = price ~ baths, data = rf_house)
##
## Coefficients:
## (Intercept) baths
## 52497 197388
# Full summary: residuals, coefficient tests, R-squared and F-statistic.
summary(object = price_baths_lm)
##
## Call:
## lm(formula = price ~ baths, data = rf_house)
##
## Residuals:
## Min 1Q Median 3Q Max
## -602047 -155383 -24885 128029 802728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52497 19041 2.757 0.00593 **
## baths 197388 8966 22.015 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 228100 on 1130 degrees of freedom
## Multiple R-squared: 0.3002, Adjusted R-squared: 0.2995
## F-statistic: 484.6 on 1 and 1130 DF, p-value: < 2.2e-16
# Scatter plot of price against number of baths with the fitted linear trend
# drawn by geom_smooth(method = "lm"). The title typo "Regession" is fixed.
ggplot(rf_house, aes(x = baths, y = price)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Linear Regression Price by number of Baths",
    x = "Number of Baths",
    y = "Price"
  ) +
  # Comma-separated price labels instead of scientific notation.
  scale_y_continuous(labels = scales::comma)
## `geom_smooth()` using formula = 'y ~ x'
After creating the two regression models, I went on to look at the R^2 values that were produced. The R^2 value measures the proportion of the variance in the dependent variable, in this case price, that is explained by the model. It is used to assess how well the independent variable explains the variability of the dependent variable. For the first regression model we looked at how ‘x_square_feet’ influenced ‘price’. We obtained an R^2 value of 0.437. On the scale of 0 to 1, this number indicates that approximately 44% of the variability in the dependent variable can be explained by the independent variable in this model. This indicates there are other factors influencing the price of a home.
For the next linear regression model we followed a similar procedure but looked at how the number of baths influenced home prices. In this case we obtained an R^2 value of 0.300, indicating that 30% of the variability in price can be explained by the number of baths in a home.
I then conducted a hypothesis test to see if R was significantly different from 0. This was the claim and our alternative hypothesis, with the null hypothesis being that R was not significantly different from 0. Using an alpha of 0.05, we obtained a near-zero p-value of 4.45e-143 for ‘x_square_feet’ and a p-value of 1.15e-89 for ‘baths’. In both cases we can reject the null hypothesis and say there is enough evidence to conclude that R is different from zero.
Lastly, we can see there is a relationship between the correlation and the regression analysis. In our correlation analysis we obtained correlation coefficients of 0.66 for x_square_feet and 0.55 for number of baths. This tells us both are positively correlated with price: as the independent variable increases, so does the dependent variable. In our regression analysis we obtained R^2 values of 0.44 and 0.30 for square feet and number of baths respectively. R^2 is given on a 0 to 1 scale, where 1 (or 100%) is the best fit for the model. It shows us that only 44% and 30% of the variability in the dependent variable is explained by our given independent variables. This tells us there is room for improvement in fitting the data with our regression models, and the relationships may not be as strong as we originally thought from our correlation analysis.
Based on this analysis, we could advise real estate agents or home buyers that the number of baths in a home and its square footage are both contributing factors leading to higher-priced homes. For individuals trying to sell their home at a higher price, this could mean it is worth the investment to add an additional bathroom.
# R-squared of price against each other numeric variable, computed as the
# squared Pearson correlation and sorted from largest to smallest.
rf_house %>%
  # select(where(...)) replaces the superseded select_if().
  select(where(is.numeric)) %>%
  na.omit() %>%
  # Long format: one row per (variable, value) pair, keeping price alongside.
  # pivot_longer() uses the default column names `name` and `value`.
  pivot_longer(cols = -price) %>%
  group_by(name) %>%
  summarise(r2 = cor(value, price)^2) %>%
  arrange(desc(r2))
## # A tibble: 4 × 2
## name r2
## <chr> <dbl>
## 1 x_square_feet 0.437
## 2 baths 0.300
## 3 lot_size 0.0563
## 4 beds 0.0350
# state the null and alternative hypothesis for the correlation test
# H0: r = 0
# Ha: r != 0
# CLAIM: r is significantly different from 0
# showing both the p-value/alpha test and the test statistic/critical value test
alpha <- 0.05
# Degrees of freedom for the correlation t-test: n - 2.
df <- nrow(rf_house) - 2
rf_house %>%
  # select(where(...)) replaces the superseded select_if().
  select(where(is.numeric)) %>%
  na.omit() %>%
  pivot_longer(cols = -price) %>%
  group_by(name) %>%
  summarise(
    cor = cor(value, price),
    r2 = cor(value, price)^2,
    p = cor.test(value, price)$p.value,
    t = cor.test(value, price)$statistic,
    # Ha is two-sided, so alpha is split across both tails. The previous
    # qt(alpha, df) gave the one-tailed lower critical value (-1.65).
    CV = qt(1 - alpha / 2, df = df)
  ) %>%
  arrange(desc(cor)) %>%
  mutate(
    sig = ifelse(
      p < alpha,
      "p-val<alpha: r significant",
      "p-val>alpha: r not significant"
    ),
    # Reject H0 when |t| exceeds the two-sided critical value.
    test_stat = ifelse(
      abs(t) > abs(CV),
      "t>CV: r significant",
      "t<CV: r not significant"
    )
  )
## # A tibble: 4 × 8
## name cor r2 p t CV sig test_stat
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 x_square_feet 0.661 0.437 4.45e-143 29.6 1.96 p-val<alpha: r sig… t>CV: r …
## 2 baths 0.548 0.300 1.15e- 89 22.0 1.96 p-val<alpha: r sig… t>CV: r …
## 3 lot_size 0.237 0.0563 5.93e- 16 8.21 1.96 p-val<alpha: r sig… t>CV: r …
## 4 beds 0.187 0.0350 2.18e- 10 6.41 1.96 p-val<alpha: r sig… t>CV: r …
Kabacoff, R. I. (2015). R in Action (2nd ed.). Manning Publications.
Bluman, A. G. (2018). Elementary statistics: A step by step approach (10th ed.). McGraw Hill.