library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(dplyr)
library(ggcorrplot)
# Load the housing data from the working directory. Column types are guessed
# by readr, and the columns keep the names found in the CSV header.
t_raw <- readr::read_csv(file = "ames.csv")
## Rows: 2930 Columns: 79
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): MSSubClass, MSZoning, Street, Alley, LotShape, LandContour, Utilit...
## dbl (33): LotFrontage, LotArea, YearBuilt, YearRemodAdd, MasVnrArea, BsmtFin...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the table with snake_case column names. The result is only printed,
# not stored, so t_raw keeps its original column names for the code below.
t_raw %>% janitor::clean_names()
## # A tibble: 2,930 × 79
## ms_sub_class ms_zoning lot_frontage lot_area street alley lot_shape
## <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 One_Story_1946_and_Ne… Resident… 141 31770 Pave No_A… Slightly…
## 2 One_Story_1946_and_Ne… Resident… 80 11622 Pave No_A… Regular
## 3 One_Story_1946_and_Ne… Resident… 81 14267 Pave No_A… Slightly…
## 4 One_Story_1946_and_Ne… Resident… 93 11160 Pave No_A… Regular
## 5 Two_Story_1946_and_Ne… Resident… 74 13830 Pave No_A… Slightly…
## 6 Two_Story_1946_and_Ne… Resident… 78 9978 Pave No_A… Slightly…
## 7 One_Story_PUD_1946_an… Resident… 41 4920 Pave No_A… Regular
## 8 One_Story_PUD_1946_an… Resident… 43 5005 Pave No_A… Slightly…
## 9 One_Story_PUD_1946_an… Resident… 39 5389 Pave No_A… Slightly…
## 10 Two_Story_1946_and_Ne… Resident… 60 7500 Pave No_A… Regular
## # ℹ 2,920 more rows
## # ℹ 72 more variables: land_contour <chr>, utilities <chr>, lot_config <chr>,
## # land_slope <chr>, neighborhood <chr>, condition1 <chr>, condition2 <chr>,
## # bldg_type <chr>, house_style <chr>, overall_qual <chr>, overall_cond <chr>,
## # year_built <dbl>, year_remod_add <dbl>, roof_style <chr>, roof_matl <chr>,
## # exterior1st <chr>, exterior2nd <chr>, mas_vnr_type <chr>,
## # mas_vnr_area <dbl>, exter_qual <chr>, exter_cond <chr>, foundation <chr>, …
# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 10000)

# Fix the RNG seed so the random train/test assignment -- and every model
# statistic derived from it -- is identical on each run.
set.seed(123)

# Draw one 0/1 flag per row; rows flagged 1 (probability 0.4) are held out for
# testing. The length comes from the data rather than a hard-coded row count,
# so the flag vector always lines up with t_raw.
test01 <- sample(
  x = 0:1,
  size = nrow(t_raw),
  replace = TRUE,
  prob = c(0.6, 0.4)
)

# Build the modelling table: the split flag, the log of the remodel year, the
# full-bathroom count and the sale price.
t <- t_raw %>%
  mutate(
    log_yearremod = log(YearRemodAdd),
    is_test = test01
  ) %>%
  select(is_test, log_yearremod, FullBath, SalePrice)
# Quick look at the distribution of each raw variable that feeds the model:
# remodel year, number of full bathrooms, and sale price.
# hist() derives each plot's title and axis label from the expression passed
# to it, so the calls are kept exactly as written.
hist(t_raw$YearRemodAdd)

hist(t_raw$FullBath)

hist(t_raw$SalePrice)

# Partition the modelling table on the split flag: rows flagged 0 form the
# training set and rows flagged 1 form the test set. The flag column is
# dropped from both partitions once the rows have been separated.
t_train <- filter(t, is_test == 0) %>%
  select(!is_test)
t_test <- filter(t, is_test == 1) %>%
  select(!is_test)
# Fit a linear model of sale price on the log remodel year and the number of
# full bathrooms using the training rows only, then print the fit summary.
m_train <- lm(formula = SalePrice ~ log_yearremod + FullBath, data = t_train)
m_train %>% summary()
##
## Call:
## lm(formula = SalePrice ~ log_yearremod + FullBath, data = t_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -174126 -39682 -8557 25304 465795
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -21383359 1207277 -17.71 <0.0000000000000002 ***
## log_yearremod 2828592 159286 17.76 <0.0000000000000002 ***
## FullBath 56124 3007 18.66 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 61910 on 1742 degrees of freedom
## Multiple R-squared: 0.4154, Adjusted R-squared: 0.4147
## F-statistic: 618.8 on 2 and 1742 DF, p-value: < 0.00000000000000022
# In-sample RMSE: root of the mean squared training residual.
rsme_train <- (m_train$residuals^2) %>%
  mean() %>%
  sqrt()

# Score the held-out rows with the training-set model, keeping the prediction,
# the residual (actual minus predicted) and its square alongside the data.
t_test_predictions <- t_test %>%
  mutate(predicted = predict(m_train, newdata = t_test)) %>%
  mutate(residuals = SalePrice - predicted) %>%
  mutate(squared_residuals = residuals^2)

# Out-of-sample RMSE from the squared test residuals computed above.
rsme_test <- sqrt(mean(t_test_predictions$squared_residuals))

# R-squared on the training data, computed by hand as 1 - RSS / TSS.
total_variation_in_model <- t_train$SalePrice - mean(t_train$SalePrice)
total_sum_of_squares <- sum(total_variation_in_model^2)
residual_sum_of_squares <- sum(m_train$residuals^2)
r2 <- 1 - residual_sum_of_squares / total_sum_of_squares
# Correlation heat map of the model variables, with each coefficient printed
# in its cell and a blue / white / red colour scale.
# The is_test column is left out: it is a randomly drawn split flag, so its
# correlations are noise and only clutter the plot.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
ggcorrplot(
  cor(select(t, -is_test)),
  lab = TRUE,
  colors = c("blue", "white", "red")
)

## This data set contains information on sold houses. The CSV file lists many attributes of each house: for example, when the house was built, how many floors it has, whether it has a pool, and many more. In this model I tried to predict how much each house sold for, using the year it was remodeled and the number of full bathrooms as predictors. The remodel year had an unusual distribution: most of the houses were either built a long time ago and not remodeled, or remodeled around 2010. From the ggcorrplot I found that the remodel year isn't very strongly correlated with sale price. The ggcorrplot also showed that the number of full bathrooms is not strongly correlated with sale price. Most of the houses had 1 or 2 full baths, and the ones with three were mostly outliers. Overall, my model was not good: it had a very low R-squared. However, I did not find any systematic issues.