Data Cleaning and EDA

Click to expand
# Load the raw Redfin export and standardise its column names with
# clean_names() (later code relies on cleaned names such as `x_square_feet`).
rf <- read.csv("redfin_health_March2024.csv")
rf <- clean_names(rf)

# EDA with skimr and DataExplorer

# Keep only the numeric columns. `select(where(...))` replaces the superseded
# scoped verb `select_if()`; `select` is namespace-qualified as elsewhere in
# this file to guard against masking by another attached package.
rf_subset <- rf |>
  dplyr::select(where(is.numeric))

# Columns 2-10 of the numeric subset are used for the housing EDA.
# NOTE(review): this selection is positional -- confirm it still picks the
# intended columns if the CSV schema changes.
rf_subset_house <- rf_subset[2:10]

# Share of missing values per variable.
plot_missing(rf_subset_house)

# Distribution of each variable.
plot_histogram(rf_subset_house)

# Boxplots of each variable broken out by price.
plot_boxplot(rf_subset_house, by = "price")
## Warning: Removed 7618 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# The correlation plot is drawn on complete cases only.
rf_subset_house_NA <- na.omit(rf_subset_house)
plot_correlation(rf_subset_house_NA)

# Density estimate for each variable.
plot_density(rf_subset_house)

# Modelling subset: sale price plus the four candidate predictors.
rf_house <- rf |>
  dplyr::select(price, beds, baths, lot_size, x_square_feet)

# Tukey fences for a numeric vector.
#
# x: numeric vector; NAs are ignored when computing the quartiles.
# k: fence multiplier (1.5 is the conventional outlier rule).
# Returns c(lower = Q1 - k * (Q3 - Q1), upper = Q3 + k * (Q3 - Q1)).
#
# The fences are intentionally NOT passed through as.integer(): truncation
# pulls both fences toward zero (e.g. an upper fence of 4.75 baths would
# become 4 and wrongly drop 4.5-bath homes) and yields NA for magnitudes
# above .Machine$integer.max, which would make the filter drop every row.
iqr_fences <- function(x, k = 1.5) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  spread <- quartiles[2] - quartiles[1]
  c(lower = quartiles[1] - k * spread, upper = quartiles[2] + k * spread)
}

# Remove outliers one variable at a time. The order matters: each variable's
# fences are computed on the rows that survived the previous filters.
# filter() also drops rows where the variable is NA, so the result has no
# missing values in any of the five columns.
# NOTE(review): results printed further down were produced with truncated
# fences; re-knit the report so row counts and coefficients are regenerated.
outlier_columns <- c("price", "x_square_feet", "baths", "beds", "lot_size")
for (column in outlier_columns) {
  fences <- iqr_fences(rf_house[[column]])
  rf_house <- rf_house |>
    filter(
      .data[[column]] >= fences[["lower"]],
      .data[[column]] <= fences[["upper"]]
    )
}

Problem 1

Below, a correlation matrix can be viewed for the selected numerical variables related to housing. I also generated a correlation table to view the results in another format. The highest correlations with price are R values of 0.548 for baths and 0.661 for x_square_feet, so I selected these variables for the later regression models, as they appear to have the largest influence on the price a home will sell for. Something I found surprising is that the number of beds in a house wasn’t more highly correlated with the house’s sale price; it had a correlation of 0.187. This was also the case for lot size, with an R value of 0.237. This can make sense because homes in more urban areas tend to sell for a lot more than homes in rural areas with larger lot sizes. I aimed to select variables in this portion that would have the greatest impact on housing prices and avoided including ones with very minimal correlation.

# Correlation heatmap for the cleaned modelling variables.
plot_correlation(rf_house)

# The same pairwise correlations as a numeric matrix, stored and printed so
# the exact coefficients can be read off.
correlation_table <- cor(rf_house)
print(correlation_table)
##                   price       beds      baths   lot_size x_square_feet
## price         1.0000000  0.1872136 0.54786430  0.2372921    0.66096106
## beds          0.1872136  1.0000000 0.50967037 -0.0784709   -0.24563931
## baths         0.5478643  0.5096704 1.00000000  0.1238801    0.07168746
## lot_size      0.2372921 -0.0784709 0.12388006  1.0000000    0.15664436
## x_square_feet 0.6609611 -0.2456393 0.07168746  0.1566444    1.00000000

Problem 2

Linear Regression for Square Feet by Price

Below I generated a linear regression model showing the relationship between ‘x_square_feet’ and ‘price’. The dependent variable, the one we’re observing the effects on, is price, and the independent variable is square feet. The line of best fit is drawn through the data points, and its positive slope reflects the positive correlation: as the number of square feet increases, so does the price. Following the cleaning and sub-setting of the data we were left with 1132 rows of observations and looked at 5 variables for their level of correlation. From the parameter coefficients we obtain the linear regression equation y’ = 1412x + 102169.

# Simple linear regression: price (response) on x_square_feet (predictor).
price_x_sq_ft_lm <- lm(formula = price ~ x_square_feet, data = rf_house)

# Print the fitted call and coefficients.
print(price_x_sq_ft_lm)
## 
## Call:
## lm(formula = price ~ x_square_feet, data = rf_house)
## 
## Coefficients:
##   (Intercept)  x_square_feet  
##        102169           1412

# Full model summary: residual quantiles, coefficient tests, R-squared and
# the overall F-test.
print(summary(price_x_sq_ft_lm))
## 
## Call:
## lm(formula = price ~ x_square_feet, data = rf_house)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -692136 -123900  -39307   93452  794904 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   102169.10   13055.07   7.826 1.15e-14 ***
## x_square_feet   1411.71      47.68  29.608  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 204600 on 1130 degrees of freedom
## Multiple R-squared:  0.4369, Adjusted R-squared:  0.4364 
## F-statistic: 876.6 on 1 and 1130 DF,  p-value: < 2.2e-16
# Scatter plot of price against square footage with the least-squares line
# overlaid via geom_smooth(method = "lm"). The y axis uses comma-formatted
# labels so prices are not shown in scientific notation.
ggplot(rf_house, aes(x = x_square_feet, y = price)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Linear Regression Price by (x) Square Feet",
    x = "Square Feet",
    y = "Price"
  ) +
  scale_y_continuous(labels = scales::comma)
## `geom_smooth()` using formula = 'y ~ x'

# Rows and columns remaining in the cleaned modelling data.
dim(rf_house)
## [1] 1132    5

Linear Regression for number of Baths by Price

In this regression model I wanted to see how the number of baths influenced the price of a home for sale. The dependent variable was again price, while the independent variable was the number of baths in a home. We can again see a positive correlation through the line of best fit in the data. This indicates that as the number of baths in a home increases, so does the price of the house. From the parameter coefficients we obtain the linear regression equation y’ = 197388x + 52497.

# Simple linear regression: price (response) on number of baths (predictor).
price_baths_lm <- lm(formula = price ~ baths, data = rf_house)

# Print the fitted call and coefficients.
print(price_baths_lm)
## 
## Call:
## lm(formula = price ~ baths, data = rf_house)
## 
## Coefficients:
## (Intercept)        baths  
##       52497       197388

# Full model summary: residual quantiles, coefficient tests, R-squared and
# the overall F-test.
print(summary(price_baths_lm))
## 
## Call:
## lm(formula = price ~ baths, data = rf_house)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -602047 -155383  -24885  128029  802728 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    52497      19041   2.757  0.00593 ** 
## baths         197388       8966  22.015  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 228100 on 1130 degrees of freedom
## Multiple R-squared:  0.3002, Adjusted R-squared:  0.2995 
## F-statistic: 484.6 on 1 and 1130 DF,  p-value: < 2.2e-16
# Scatter plot of price against number of baths with the least-squares line
# overlaid via geom_smooth(method = "lm"). The y axis uses comma-formatted
# labels so prices are not shown in scientific notation.
ggplot(rf_house, aes(x = baths, y = price)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Linear Regression Price by number of Baths",
    x = "Number of Baths",
    y = "Price"
  ) +
  scale_y_continuous(labels = scales::comma)
## `geom_smooth()` using formula = 'y ~ x'

Statistical outcomes of the Regression Models

After creating the two regression models I went on to look at the R^2 values that were produced. The R^2 value measures the proportion of the variance in the dependent variable, in this case price, that is explained by the model. It is used to assess how well the independent variable explains the variability of the dependent variable. For the first regression model we looked at how ‘x_square_feet’ influenced ‘price’. We obtained an R^2 value of .437. On the range of 0 to 1, this number indicates that approximately 44% of the variability in the dependent variable can be explained by the independent variable in this model. This indicates there are other factors influencing the price of a home.

For the next linear regression model we followed a similar procedure but looked at how the number of baths influenced home prices. In this case we obtained an R^2 value of 0.300, indicating that 30% of the variability in price can be explained by the number of baths in a home.

I then conducted a hypothesis test to see if R was significantly different from 0. This was the claim and our alternative hypothesis, with the null hypothesis being that R was not significantly different from 0. Using an alpha of 0.05 we obtained near-zero p-values of 4.45e-143 for ‘x_square_feet’ and 1.15e-89 for ‘baths’. In both cases we can reject the null hypothesis and say there is enough evidence to conclude R is different from zero.

Lastly, we can see there is a relationship between the correlation and the regression analysis. In our correlation analysis we obtained correlation coefficients of .66 for x_square_feet and .55 for number of baths. This tells us both are positively correlated with price: as the independent variable increases, so does the dependent variable. In our regression analysis we obtained R^2 values of .44 and .30 for square feet and number of baths respectively; for a single predictor, R^2 is simply the square of the correlation coefficient (.66^2 ≈ .44 and .55^2 ≈ .30). R^2 is given on a 0 to 1 scale where 1 (or 100%) is a perfect fit for the model. It shows us that only 44% and 30% of the variability in the dependent variable is explained by our given independent variables. This tells us there is room for improvement in fitting the data to our regression models, and the relationships may not be as strong as we originally thought from our correlation analysis.

Based on this analysis we could advise real estate agents or home buyers that both the number of baths in a home and its square footage are associated with higher-priced homes. For individuals trying to sell their home at a higher price, this suggests it could be worth the investment to add an additional bathroom, although these models show association rather than proof that adding a bathroom causes a higher sale price.

# calculate r-squared values
# For every numeric column other than price, square its correlation with
# price and list the results from largest to smallest.
# `select(where(...))` replaces the superseded `select_if()`.
rf_house |>
  dplyr::select(where(is.numeric)) |>
  na.omit() |>
  pivot_longer(cols = -price) |>
  group_by(name) |>
  summarise(r2 = cor(value, price)^2) |>
  arrange(desc(r2))
## # A tibble: 4 × 2
##   name              r2
##   <chr>          <dbl>
## 1 x_square_feet 0.437 
## 2 baths         0.300 
## 3 lot_size      0.0563
## 4 beds          0.0350
# state the null and alternative hypothesis for the correlation test
# H0: r = 0
# Ha: r != 0
# CLAIM: r is significantly different from 0
# showing both the p-value/alpha test and the test statistic/critical value test
alpha <- 0.05

# One-row summary of the correlation test between x and y.
#
# x, y:  numeric vectors of equal length.
# alpha: significance level of the two-sided test.
# Returns a data frame with the correlation (cor), its square (r2), the
# two-sided p-value (p), the t statistic (t) and the two-sided critical
# value (CV).
#
# Because Ha is r != 0, the critical value is the upper alpha / 2 quantile,
# qt(1 - alpha / 2, df), not qt(alpha, df), which is the lower one-tailed
# quantile. The degrees of freedom come from the test itself (n - 2), so they
# always match the rows actually used.
cor_test_summary <- function(x, y, alpha) {
  test <- cor.test(x, y)
  r <- unname(test$estimate)
  data.frame(
    cor = r,
    r2 = r^2,
    p = test$p.value,
    t = unname(test$statistic),
    CV = qt(1 - alpha / 2, df = unname(test$parameter))
  )
}

rf_house |>
  dplyr::select(where(is.numeric)) |>
  na.omit() |>
  pivot_longer(cols = -price) |>
  group_by(name) |>
  # An unnamed data-frame result is unpacked into one column per field.
  summarise(cor_test_summary(value, price, alpha)) |>
  arrange(desc(cor)) |>
  mutate(
    sig = ifelse(
      p < alpha,
      "p-val<alpha: r significant",
      "p-val>alpha: r not significant"
    ),
    test_stat = ifelse(
      abs(t) > abs(CV),
      "t>CV: r significant",
      "t<CV: r not significant"
    )
  )
## # A tibble: 4 × 8
##   name            cor     r2         p     t    CV sig                 test_stat
##   <chr>         <dbl>  <dbl>     <dbl> <dbl> <dbl> <chr>               <chr>    
## 1 x_square_feet 0.661 0.437  4.45e-143 29.6  -1.65 p-val<alpha: r sig… t>CV: r …
## 2 baths         0.548 0.300  1.15e- 89 22.0  -1.65 p-val<alpha: r sig… t>CV: r …
## 3 lot_size      0.237 0.0563 5.93e- 16  8.21 -1.65 p-val<alpha: r sig… t>CV: r …
## 4 beds          0.187 0.0350 2.18e- 10  6.41 -1.65 p-val<alpha: r sig… t>CV: r …

Work Cited

Kabacoff, R. I. (2015). R in Action (2nd ed.). Manning Publications.

Bluman, A. G. (2018). Elementary statistics: A step by step approach (10th ed.). McGraw Hill.