# Execute the line below this comment (Ctrl Enter)
# to make the Homework functions available to you (required!).
source("https://www.cpp.edu/~clange/RData/Homework/FunctionsForHomework.R")
# In the line below this comment
# substitute the string admin with your Bronco user name in lower case.
# Do not delete the "" quotes!
# Then execute the line below (Ctrl Enter).
SetUpHomework("marreola",20)

# Note, you might have to install "janitor" before you can 
# load it with: library(janitor)
# Do not install with the install.packages() command!!!
# Instead in R Studio click on the "Tools" item.
# Tools -> Install Packages ...
# Add janitor in the field of the dialog box.
# Click Install
library(janitor)

# Note, you might have to install "tidymodels" before you can 
# load it with: library(tidymodels)
# Do not install with the install.packages() command!!!
# Instead in R Studio click on the "Tools" item.
# Tools -> Install Packages ...
# Add tidymodels in the field of the dialog box.
# Click Install
library(tidymodels)

# Execute the lines below this comment (Ctrl Enter) and
# your data for this homework will be loaded in the dataframe DataHousing.
# It will be a sample of 5000 records from the complete dataset.
# Be patient (can take up to 5 minutes) because initially many 
# thousand records are downloaded.
# Download the full housing dataset, convert the column names to
# UpperCamel case, and keep a random sample of 5000 rows.
DataHousing <-
  read_csv("https://www.cpp.edu/~clange/AiBook/HousingData.csv") %>%
  clean_names("upper_camel") %>%
  sample_n(size = 5000)
################
#Question 1: Run a regression with the 
# house price as outcome 
# variable and SqftLiving as a predictor variable. 
# Note, you have to check for the correct variable name
# for the house price. 
# Save the results of your regression in the variable "ModelSqft".
#
# If a house's living space increases by one sqft,
# by how much would the estimated house price increase?
# This is the answer to question 1 (in dollars, without the $ sign;
# e.g. 123.7653 but not $123.76).
# The code is partially displayed below. Substitute the @@@ for
# the correct code before you run it.

QuestionNumber <- 1

# Simple linear regression: house price explained by living area (sqft).
ModelSqft <- lm(formula = Price ~ SqftLiving, data = DataHousing)

# Print the coefficient table; the SqftLiving estimate is the change in
# predicted price per additional square foot.
summary(ModelSqft)
## 
## Call:
## lm(formula = Price ~ SqftLiving, data = DataHousing)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1672170  -152315   -24816   110163  4190803 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -73238.089   9638.367  -7.599 3.55e-14 ***
## SqftLiving     297.297      4.194  70.889  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 279300 on 4998 degrees of freedom
## Multiple R-squared:  0.5014, Adjusted R-squared:  0.5013 
## F-statistic:  5025 on 1 and 4998 DF,  p-value: < 2.2e-16
# Take the SqftLiving slope directly from the fitted model instead of
# retyping it from the printed summary: this keeps full precision and
# stays correct when the random sample (and hence the estimate) changes.
Answer1 <- coef(ModelSqft)[["SqftLiving"]]
######## Question 2
# Based on your results from Question 1
# If an addition is built that adds 250 sqft to the house,
# by how much would the predicted house price increase?
# This is the answer to question 2 (in dollars, without the $ sign;
# e.g. 123.7653 but not $123.76).
QuestionNumber <- 2

# A 250 sqft addition scales the per-sqft effect from question 1 by 250.
Answer2 <- Answer1 * 250
Answer2
## [1] 74324.25
#### Question 3
# Use the ifelse() command to add a dummy variable to the
# dataframe DataHousing. The dummy should be 1, if the house is at the 
# waterfront and 0 otherwise. Call the
# variable DummyWf.
# The answer for question 3 is: 
# How many houses in your dataset are at the waterfront.
# Hint, because you assigned a 1 for a house at the waterfront
# and 0 for one that is not at the waterfront you can utilize
# the sum command.

QuestionNumber <- 3

# Encode the waterfront flag as a numeric dummy: 1 when Waterfront is
# "yes", 0 for any other non-missing value.
DataHousing <- mutate(
  DataHousing,
  DummyWf = ifelse(Waterfront == "yes", 1, 0)
)

# Summing a 0/1 column counts the rows flagged with 1.
Answer3 <- sum(DataHousing$DummyWf)
Answer3
## [1] 36
########Question 4: 
# Run a regression with the house price as the outcome variable.
# The predictor variables are the SqftLiving and the dummy variable 
# you created for a house being at the waterfront.
# The answer to question 4 is how much a waterfront property 
# adds to the estimated house price (in dollar without the $ sign. 
# E.g. 123.7653 but not $123.76).  
# The code is partially displayed below. Substitute the @@@ for
# the correct code before you run it.

QuestionNumber <- 4

# Regression of price on living area plus the waterfront dummy.
ModelWithDummy <- lm(
  formula = Price ~ SqftLiving + DummyWf,
  data = DataHousing
)

# Print the coefficient table for both predictors.
summary(ModelWithDummy)
## 
## Call:
## lm(formula = Price ~ SqftLiving + DummyWf, data = DataHousing)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1578174  -147605   -23541   112188  4273299 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -62732.555   9384.072  -6.685 2.56e-11 ***
## SqftLiving     289.580      4.099  70.646  < 2e-16 ***
## DummyWf     788068.637  45664.757  17.258  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 271400 on 4997 degrees of freedom
## Multiple R-squared:  0.5294, Adjusted R-squared:  0.5292 
## F-statistic:  2811 on 2 and 4997 DF,  p-value: < 2.2e-16
# Waterfront premium: the DummyWf coefficient read from the fitted model
# rather than retyped from the printed summary, so it keeps full precision
# and tracks the current data sample.
Answer4 <- coef(ModelWithDummy)[["DummyWf"]]
########Question 5: 
# Use the results from question 4.  
# The answer to question 5 is how much an extra SqftLiving 
# adds to the estimated house price (in dollar without the $ sign. 
# E.g. 123.7653 but not $123.76).  
QuestionNumber <- 5

# Per-sqft effect in the two-predictor model: the SqftLiving coefficient
# read from ModelWithDummy rather than retyped from the printed summary,
# so it keeps full precision and tracks the current data sample.
Answer5 <- coef(ModelWithDummy)[["SqftLiving"]]
########Question 6: 
# Below is the definition for an error function for the MSE (in billions).
# Complete the definition for the error function and execute
# the code.
# In the following line you find a command that calculates
# the MSE in billions  for an intercept of negative 30,000, 
# a SqftLiving coefficient of 270 and a coefficient for the
# waterfront dummy of 800,000 based on the error function. 
# Execute that command. The result is the answer 
# for the related MSE in billion
# and the answer to question 6.
# Number of observations in DataHousing; used as the divisor in the MSE.
N <- nrow(DataHousing)

# Mean squared error, in billions, of the linear prediction
#   b1 * SqftLiving + b2 * DummyWf + b0
# against the observed Price in DataHousing.
#
# Arguments:
#   b0  intercept
#   b1  coefficient for SqftLiving
#   b2  coefficient for the waterfront dummy DummyWf
# Returns:
#   the sum of squared prediction errors divided by N and by 1e9.
#
# Reads the globals DataHousing and N when called.
# The coefficients must come from the arguments; hard-coding estimates in
# the body would make every call return the same value.
FctMseErrorInBillions <- function(b0, b1, b2) {
  Predicted <- b1 * DataHousing$SqftLiving + b2 * DataHousing$DummyWf + b0
  sum((DataHousing$Price - Predicted)^2) / N / 1000000000
}

# Evaluate the error function at intercept -30,000, SqftLiving
# coefficient 270 and waterfront-dummy coefficient 900,000.
# NOTE(review): the question text above specifies 800,000 for the
# waterfront coefficient, but this call passes 900000 -- confirm which
# value is intended.
Answer6 <- FctMseErrorInBillions(-30000, 270, 900000)
Answer6
## [1] 73.59787