library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(dplyr)
library(ggcorrplot)

# Read the Ames housing data from a CSV file in the working directory.
t_raw <- read_csv(file = "ames.csv")
## Rows: 2930 Columns: 79
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): MSSubClass, MSZoning, Street, Alley, LotShape, LandContour, Utilit...
## dbl (33): LotFrontage, LotArea, YearBuilt, YearRemodAdd, MasVnrArea, BsmtFin...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the data with snake_case column names. The result is only printed,
# not assigned, so `t_raw` keeps its original column names for the code below.
t_raw %>%
  janitor::clean_names()
## # A tibble: 2,930 × 79
##    ms_sub_class           ms_zoning lot_frontage lot_area street alley lot_shape
##    <chr>                  <chr>            <dbl>    <dbl> <chr>  <chr> <chr>    
##  1 One_Story_1946_and_Ne… Resident…          141    31770 Pave   No_A… Slightly…
##  2 One_Story_1946_and_Ne… Resident…           80    11622 Pave   No_A… Regular  
##  3 One_Story_1946_and_Ne… Resident…           81    14267 Pave   No_A… Slightly…
##  4 One_Story_1946_and_Ne… Resident…           93    11160 Pave   No_A… Regular  
##  5 Two_Story_1946_and_Ne… Resident…           74    13830 Pave   No_A… Slightly…
##  6 Two_Story_1946_and_Ne… Resident…           78     9978 Pave   No_A… Slightly…
##  7 One_Story_PUD_1946_an… Resident…           41     4920 Pave   No_A… Regular  
##  8 One_Story_PUD_1946_an… Resident…           43     5005 Pave   No_A… Slightly…
##  9 One_Story_PUD_1946_an… Resident…           39     5389 Pave   No_A… Slightly…
## 10 Two_Story_1946_and_Ne… Resident…           60     7500 Pave   No_A… Regular  
## # ℹ 2,920 more rows
## # ℹ 72 more variables: land_contour <chr>, utilities <chr>, lot_config <chr>,
## #   land_slope <chr>, neighborhood <chr>, condition1 <chr>, condition2 <chr>,
## #   bldg_type <chr>, house_style <chr>, overall_qual <chr>, overall_cond <chr>,
## #   year_built <dbl>, year_remod_add <dbl>, roof_style <chr>, roof_matl <chr>,
## #   exterior1st <chr>, exterior2nd <chr>, mas_vnr_type <chr>,
## #   mas_vnr_area <dbl>, exter_qual <chr>, exter_cond <chr>, foundation <chr>, …
# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 10000)

# Fix the RNG seed so the random train/test split, and every model statistic
# derived from it, is reproducible from run to run.
set.seed(2930)

# Randomly flag each row of the data: 0 = training (probability 0.6),
# 1 = test (probability 0.4). The number of flags follows the data rather
# than a hard-coded row count, so the split stays valid if the file changes.
test01 <- sample(
  x = 0:1,
  size = nrow(t_raw),
  replace = TRUE,
  prob = c(0.6, 0.4)
)

# Modelling table: the split flag, the log of the remodel year, the number of
# full bathrooms, and the sale price (in that column order).
t <- with(
  t_raw,
  tibble(
    is_test = test01,
    log_yearremod = log(YearRemodAdd),
    FullBath = FullBath,
    SalePrice = SalePrice
  )
)

# Draw one histogram per raw variable used in the model. The title and x-axis
# label are spelled out so they match what a direct `hist(t_raw$<col>)` call
# would generate.
for (col_name in c("YearRemodAdd", "FullBath", "SalePrice")) {
  col_label <- paste0("t_raw$", col_name)
  hist(
    t_raw[[col_name]],
    main = paste("Histogram of", col_label),
    xlab = col_label
  )
}

# Split the modelling table on the random flag and drop the flag column:
# rows flagged 0 form the training set, rows flagged 1 the test set.
t_train <- select(filter(t, is_test == 0), -is_test)

t_test <- select(filter(t, is_test == 1), -is_test)

# Fit a linear model of sale price on log remodel year and full-bath count
# using the training rows only, then print the fit summary.
m_train <- lm(
  formula = SalePrice ~ log_yearremod + FullBath,
  data = t_train
)
summary(m_train)
## 
## Call:
## lm(formula = SalePrice ~ log_yearremod + FullBath, data = t_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -174126  -39682   -8557   25304  465795 
## 
## Coefficients:
##                Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   -21383359    1207277  -17.71 <0.0000000000000002 ***
## log_yearremod   2828592     159286   17.76 <0.0000000000000002 ***
## FullBath          56124       3007   18.66 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 61910 on 1742 degrees of freedom
## Multiple R-squared:  0.4154, Adjusted R-squared:  0.4147 
## F-statistic: 618.8 on 2 and 1742 DF,  p-value: < 0.00000000000000022
# Root mean squared error of the fitted model on the training rows.
rsme_train <- (m_train[["residuals"]]^2) %>%
  mean() %>%
  sqrt()

# Score the held-out rows with the training-set model and keep the
# prediction, the residual (actual minus predicted), and its square.
t_test_predictions <- mutate(
  t_test,
  predicted = predict(m_train, newdata = t_test),
  residuals = SalePrice - predicted,
  squared_residuals = residuals^2
)

# Root mean squared error on the test rows, from the stored squared residuals.
rsme_test <- sqrt(mean(t_test_predictions[["squared_residuals"]]))

# Recompute R-squared on the training data by hand: 1 - RSS / TSS.

# Deviation of each training sale price from the training mean.
total_variation_in_model <- with(t_train, SalePrice - mean(SalePrice))

# TSS: squared deviations from the mean, summed.
total_sum_of_squares <- sum(total_variation_in_model^2)

# RSS: squared model residuals, summed.
residual_sum_of_squares <- sum(m_train[["residuals"]]^2)

r2 <- 1 - residual_sum_of_squares / total_sum_of_squares

# Correlation heat map of the model variables. The random train/test flag is
# dropped first: it is noise by construction and only adds a meaningless
# row and column to the matrix. `TRUE` is used instead of the reassignable `T`.
t %>%
  select(-is_test) %>%
  cor() %>%
  ggcorrplot(
    lab = TRUE,
    colors = c("blue", "white", "red")
  )

## This data is information on sold houses. The csv file gives all information on each house. For example, it lists when the house was built, how many floors it has, whether it has a pool, and much more. In this model I was trying to predict how much each house sold for. I used the year it was remodeled and how many full bathrooms the house has to predict the price. The year remodeled had a weird distribution. Most of the houses were either built a really long time ago and not remodeled, or remodeled around 2010. From the ggcorrplot I found that the year remodeled isn't very strongly correlated with Sale Price. In the ggcorrplot I also found that the number of full bathrooms is not strongly correlated with Sale Price either. Most of the houses had 1 or 2 full baths, and the ones with three were mostly outliers. Overall my model was not good. I had a very low R-squared; however, I did not have any systematic issues.