library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(broom)
library(lindia)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
# Location of the apartments data set (a comma-separated file served over HTTPS).
url <- "https://raw.githubusercontent.com/leontoddjohnson/i590/main/data/apartments/apartments.csv"

# Download and parse the file; readr guesses the column types.
apts <- read_delim(file = url, delim = ",")
## Rows: 492 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): in_sf, beds, bath, price, year_built, sqft, price_per_sqft, elevation
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the data in the viewer for a quick look.
view(apts)

# Simple linear regression of price on square footage, fit only to the
# rows where in_sf == 0.
model <- lm(
  formula = price ~ sqft,
  data = filter(apts, in_sf == 0)
)
rsquared <- summary(model)[["r.squared"]]

# Scatter plot of the same subset with two trend lines:
#   gray dashed   = straight-line (lm) fit
#   default color = ggplot2's default smoother, used as a flexible reference
# The closer the two lines are, the better a linear model describes the data.
ggplot(
  data = filter(apts, in_sf == 0),
  mapping = aes(x = sqft, y = price)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    se = FALSE,
    color = "gray",
    linetype = "dashed"
  ) +
  geom_smooth(se = FALSE) +
  labs(
    title = "Price vs. sqft",
    subtitle = paste("Linear Fit R-Squared =", round(rsquared, 3))
  ) +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Since Week 10 demonstrated transforming variables, the explanation could have started with a plot that suggests a linear model. Price vs. sqft would have been the best choice to showcase this.
The variables selected for the linear model could have been better. We observed that price vs. sqft gave a somewhat more linear relationship than price vs. price_per_sqft.
No transformations were required with the above choice of variables.
# Simple linear regression of price per square foot on bedroom count, fit
# only to the rows where in_sf == 0.
model <- lm(
  formula = price_per_sqft ~ beds,
  data = filter(apts, in_sf == 0)
)
rsquared <- summary(model)[["r.squared"]]

# Scatter plot of the same subset with two trend lines:
#   gray dashed   = straight-line (lm) fit
#   default color = ggplot2's default smoother, used as a flexible reference
# NOTE(review): the default smoother emits loess near-singularity warnings
# for this plot, presumably because beds takes only a few distinct values.
ggplot(
  data = filter(apts, in_sf == 0),
  mapping = aes(x = beds, y = price_per_sqft)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    se = FALSE,
    color = "gray",
    linetype = "dashed"
  ) +
  geom_smooth(se = FALSE) +
  labs(
    title = "price_per_sqft vs. beds",
    subtitle = paste("Linear Fit R-Squared =", round(rsquared, 3))
  ) +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 1.6637e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 1
# Simple linear regression of price per square foot on elevation, fit only
# to the rows where in_sf == 0.
model <- lm(
  formula = price_per_sqft ~ elevation,
  data = filter(apts, in_sf == 0)
)
rsquared <- summary(model)[["r.squared"]]

# Scatter plot of the same subset with two trend lines:
#   gray dashed   = straight-line (lm) fit
#   default color = ggplot2's default smoother, used as a flexible reference
ggplot(
  data = filter(apts, in_sf == 0),
  mapping = aes(x = elevation, y = price_per_sqft)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    se = FALSE,
    color = "gray",
    linetype = "dashed"
  ) +
  geom_smooth(se = FALSE) +
  labs(
    title = "price_per_sqft vs. elevation",
    subtitle = paste("Linear Fit R-Squared =", round(rsquared, 3))
  ) +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Simple linear regression of price on elevation, fit only to the rows
# where in_sf == 0.
model <- lm(
  formula = price ~ elevation,
  data = filter(apts, in_sf == 0)
)
rsquared <- summary(model)[["r.squared"]]

# Scatter plot of the same subset with two trend lines:
#   gray dashed   = straight-line (lm) fit
#   default color = ggplot2's default smoother, used as a flexible reference
ggplot(
  data = filter(apts, in_sf == 0),
  mapping = aes(x = elevation, y = price)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    se = FALSE,
    color = "gray",
    linetype = "dashed"
  ) +
  geom_smooth(se = FALSE) +
  labs(
    title = "price vs. elevation",
    subtitle = paste("Linear Fit R-Squared =", round(rsquared, 3))
  ) +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Simple linear regression of price on construction year, fit only to the
# rows where in_sf == 0.
model <- lm(
  formula = price ~ year_built,
  data = filter(apts, in_sf == 0)
)
rsquared <- summary(model)[["r.squared"]]

# Scatter plot of the same subset with two trend lines:
#   gray dashed   = straight-line (lm) fit
#   default color = ggplot2's default smoother, used as a flexible reference
ggplot(
  data = filter(apts, in_sf == 0),
  mapping = aes(x = year_built, y = price)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    se = FALSE,
    color = "gray",
    linetype = "dashed"
  ) +
  geom_smooth(se = FALSE) +
  labs(
    title = "price vs. year_built",
    subtitle = paste("Linear Fit R-Squared =", round(rsquared, 3))
  ) +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Consider the question: what could happen if the models above were deployed on a platform like Zillow? The data set won't work well with a linear model (as seen in the plots above). If we put such a model into the real world, it won't help either the company or buyers understand how prices fluctuate based on various factors.
Suggestion: Perhaps add some explanation or a cheat sheet covering the various line shapes and their expressions. This would be helpful when mutating variables to transform the data for plotting linear models.
We were unable to apply transformations to other variables, for example beds, baths, and elevation, since the model fits linearly only for price or price per sqft vs. sqft. It will therefore be difficult to test whether a transformation makes the linear model fit better, so neither students nor the organization could experiment on all variables to test this dataset.