# Execute the line below this comment (Ctrl Enter)
# to make the Homework functions available to you (required!).
source("https://www.cpp.edu/~clange/RData/Homework/FunctionsForHomework.R")
# In the line below this comment
# substitute the string admin with your Bronco user name in lower case.
# Do not delete the "" quotes!
# Then execute the line below (Ctrl Enter).
# NOTE(review): the second argument (20) is presumably the homework
# number -- confirm against FunctionsForHomework.R.
SetUpHomework("marreola", 20)
# Note, you might have to install "janitor" before you can
# load it with: library(janitor)
# Do not install with the install.packages() command!!!
# Instead in R Studio click on the "Tools" item.
# Tools -> Install Packages ...
# Add janitor in the field of the dialog box.
# Click Install
library(janitor)
# Note, you might have to install "tidymodels" before you can
# load it with: library(tidymodels)
# Do not install with the install.packages() command!!!
# Instead in R Studio click on the "Tools" item.
# Tools -> Install Packages ...
# Add tidymodels in the field of the dialog box.
# Click Install
library(tidymodels)
# Execute the lines below this comment (Ctrl Enter) and
# your data for this homework will be loaded in the dataframe DataHousing.
# It will be a sample of 5000 records from the complete dataset.
# Be patient (can take up to 5 minutes) because initially many
# thousand records are downloaded.
# Download the housing data, convert the column names to UpperCamel case,
# and keep a random sample of 5000 rows.
DataHousing <-
  read_csv("https://www.cpp.edu/~clange/AiBook/HousingData.csv") %>%
  clean_names(case = "upper_camel") %>%
  sample_n(size = 5000)
################
#Question 1: Run a regression with the
# house price as outcome
# variable and SqftLiving as a predictor variable.
# Note, you have to check for the correct variable name
# for the house price.
# Save the results of your regression in the variable "ModelSqft".
#
# If a house's living space increases by
# one sqft, by how much would the estimated house price increase?
# This is the answer to question 1 (in dollars, without the $ sign;
# e.g. 123.7653 but not $123.76).
# The code is partially displayed below. Substitute the @@@ for
# the correct code before you run it.
QuestionNumber <- 1
# Simple linear regression of house price on living area (sqft).
ModelSqft <- lm(Price ~ SqftLiving, data = DataHousing)
summary(ModelSqft)
# The summary below was recorded from one run on one random sample.
##
## Call:
## lm(formula = Price ~ SqftLiving, data = DataHousing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1672170 -152315 -24816 110163 4190803
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -73238.089 9638.367 -7.599 3.55e-14 ***
## SqftLiving 297.297 4.194 70.889 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 279300 on 4998 degrees of freedom
## Multiple R-squared: 0.5014, Adjusted R-squared: 0.5013
## F-statistic: 5025 on 1 and 4998 DF, p-value: < 2.2e-16
# Answer 1 is the SqftLiving slope: the estimated change in price (dollars)
# for one additional sqft of living space. It is read from the fitted model
# instead of being typed in from the printed summary, so it keeps full
# precision and stays correct when DataHousing is a different random sample.
Answer1 <- coef(ModelSqft)[["SqftLiving"]]
######## Question 2
# Based on your results from Question 1
# If an addition is built that adds 250 sqft to the house,
# by how much would the predicted house price increase?
# This is the answer to question 2 (in dollars, without the $ sign;
# e.g. 123.7653 but not $123.76).
QuestionNumber <- 2
# Predicted price increase for 250 extra sqft: the per-sqft effect
# from question 1 scaled by 250.
Answer2 <- Answer1 * 250
Answer2
## [1] 74324.25
#### Question 3
# Use the ifelse() command to add a dummy variable to the
# dataframe DataHousing. The dummy should be 1, if the house is at the
# waterfront and 0 otherwise. Call the
# variable DummyWf.
# The answer for question 3 is:
# How many houses in your dataset are at the waterfront.
# Hint, because you assigned a 1 for a house at the waterfront
# and 0 for one that is not at the waterfront you can utilize
# the sum command.
QuestionNumber <- 3
# Waterfront dummy: 1 when Waterfront is "yes", 0 otherwise.
DataHousing <- mutate(
  DataHousing,
  DummyWf = ifelse(Waterfront == "yes", 1, 0)
)
# The dummy is 0/1, so its sum counts the waterfront houses.
Answer3 <- sum(DataHousing[["DummyWf"]])
Answer3
## [1] 36
########Question 4:
# Run a regression with the house price as the outcome variable.
# The predictor variables are the SqftLiving and the dummy variable
# you created for a house being at the waterfront.
# The answer to question 4 is how much a waterfront property
# adds to the estimated house price (in dollar without the $ sign.
# E.g. 123.7653 but not $123.76).
# The code is partially displayed below. Substitute the @@@ for
# the correct code before you run it.
QuestionNumber <- 4
# Regression of house price on living area (sqft) and the waterfront dummy.
ModelWithDummy <- lm(Price ~ SqftLiving + DummyWf, data = DataHousing)
summary(ModelWithDummy)
# The summary below was recorded from one run on one random sample.
##
## Call:
## lm(formula = Price ~ SqftLiving + DummyWf, data = DataHousing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1578174 -147605 -23541 112188 4273299
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -62732.555 9384.072 -6.685 2.56e-11 ***
## SqftLiving 289.580 4.099 70.646 < 2e-16 ***
## DummyWf 788068.637 45664.757 17.258 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 271400 on 4997 degrees of freedom
## Multiple R-squared: 0.5294, Adjusted R-squared: 0.5292
## F-statistic: 2811 on 2 and 4997 DF, p-value: < 2.2e-16
# Answer 4 is the DummyWf coefficient: the estimated price difference
# (dollars) for a waterfront house, holding SqftLiving fixed. It is read
# from the fitted model instead of being typed in from the printed summary,
# so it keeps full precision and stays correct when DataHousing is a
# different random sample.
Answer4 <- coef(ModelWithDummy)[["DummyWf"]]
########Question 5:
# Use the results from question 4.
# The answer to question 5 is how much an extra SqftLiving
# adds to the estimated house price (in dollar without the $ sign.
# E.g. 123.7653 but not $123.76).
QuestionNumber <- 5
# Answer 5 is the SqftLiving coefficient of the question 4 model
# (ModelWithDummy). It is read from the fitted model instead of being typed
# in from the printed summary, so it keeps full precision and always matches
# the current fit.
Answer5 <- coef(ModelWithDummy)[["SqftLiving"]]
########Question 6:
# Below is the definition for an error function for the MSE (in billions).
# Complete the definition for the error function and execute
# the code.
# In the following line you find a command that calculates
# the MSE in billions for an intercept of negative 30,000,
# a SqftLiving coefficient of 270 and a coefficient for the
# waterfront dummy of 800,000 based on the error function.
# Execute that command. The result is the answer
# for the related MSE in billion
# and the answer to question 6.
# Number of rows in DataHousing.
N <- nrow(DataHousing)
# Mean squared error (MSE), in billions, of the linear prediction
#   b1 * SqftLiving + b2 * DummyWf + b0
# against the observed Price.
#
# Arguments:
#   b0    intercept.
#   b1    coefficient on SqftLiving.
#   b2    coefficient on the waterfront dummy DummyWf.
#   data  data frame with columns Price, SqftLiving and DummyWf. Defaults to
#         the global DataHousing, so three-argument calls work unchanged.
#
# Returns the sum of squared prediction errors divided by the number of rows
# of `data` and then by 1,000,000,000.
FctMseErrorInBillions <- function(b0, b1, b2, data = DataHousing) {
  # The prediction must be built from the arguments; with fixed numbers in
  # its place the function returns the same value for every b0, b1, b2.
  Predicted <- b1 * data$SqftLiving + b2 * data$DummyWf + b0
  sum((data$Price - Predicted)^2) / nrow(data) / 1000000000
}
# Evaluate the MSE (in billions) at the coefficients stated in the question:
# intercept -30,000, SqftLiving coefficient 270, waterfront coefficient
# 800,000.
Answer6 <- FctMseErrorInBillions(-30000, 270, 800000)
Answer6
# NOTE(review): the output recorded below comes from a run made before the
# third argument was corrected from 900000 to 800000; re-run to refresh it.
## [1] 73.59787