This is a real dataset of houses sold in Seattle, WA, USA between August and December 2022.
The dataset consists of a dataframe with 1665 observations with the following 6 variables:
1. beds = Number of bedrooms in the property
2. Region = Region of Seattle, based on zip code: North, Central or South
3. baths = Number of bathrooms in the property. Note that 0.5 corresponds to a half-bath, which has a sink and toilet but no tub or shower
4. size = Total floor area of the property, in square feet
5. lot_size = Total area of the land on which the property is located, in square feet
6. price = Price the property was sold for, in US dollars (this is the response variable)
# Load the Seattle house-price data from Google Drive into `price_data`.
# Share link:
# https://drive.google.com/file/d/1LGxmER8s0PiGtrxoJA0fLCGU5IvCB4mh/view?usp=sharing
# The file ID is the path segment between "/d/" and "/view" in the share link.
file_id <- "1LGxmER8s0PiGtrxoJA0fLCGU5IvCB4mh"
# Direct-download form of the share link, passed straight to read.csv().
data_url <- paste0(
  "https://docs.google.com/uc?export=download&id=",
  file_id
)
# header = TRUE: the first line of the file supplies the column names.
price_data <- read.csv(file = data_url, header = TRUE)
# Preview the first six rows of the data.
head(price_data, n = 6L)
## beds Region baths size lot_size price
## 1 1 Central 1 704 500 645000
## 2 3 Central 3 1524 513 850000
## 3 3 Central 3 1524 513 875000
## 4 3 North 3 1450 525 750000
## 5 1 Central 1 480 560 275000
## 6 2 North 1 800 560 690000
# Exploratory Data Analysis
# Univariate summaries for every column of price_data: quartiles and mean for
# the numeric columns, length/class/mode for the character column Region.
summary(object = price_data)
## beds Region baths size
## Min. : 1.000 Length:1665 Min. :0.500 Min. : 376
## 1st Qu.: 2.000 Class :character 1st Qu.:1.500 1st Qu.: 1260
## Median : 3.000 Mode :character Median :2.000 Median : 1720
## Mean : 3.126 Mean :2.298 Mean : 1896
## 3rd Qu.: 4.000 3rd Qu.:3.000 3rd Qu.: 2360
## Max. :15.000 Max. :9.000 Max. :11010
## lot_size price
## Min. : 500 Min. : 159488
## 1st Qu.: 2734 1st Qu.: 680000
## Median : 5000 Median : 865000
## Mean : 9673 Mean :1010483
## 3rd Qu.: 7350 3rd Qu.:1175000
## Max. :400752 Max. :6250000
# Univariate charts: one histogram per quantitative variable, drawn on a
# 3 x 2 grid (five panels used, one left empty).
# par() returns the previous settings; they are restored at the end so the
# grid layout does not leak into the plots that follow.
old_par <- par(mfrow = c(3, 2))
hist(price_data$beds)
hist(price_data$baths)
hist(price_data$size)
hist(price_data$lot_size)
hist(price_data$price)
par(old_par)
# Correlation matrix of the quantitative variables, rounded to 2 decimals.
# Numeric columns are selected by type rather than by position, so the
# character column Region is excluded even if the column order changes.
numeric_cols <- vapply(price_data, is.numeric, logical(1))
round(cor(price_data[, numeric_cols, drop = FALSE]), 2)
## beds baths size lot_size price
## beds 1.00 0.59 0.73 -0.13 0.46
## baths 0.59 1.00 0.62 -0.08 0.54
## size 0.73 0.62 1.00 -0.07 0.74
## lot_size -0.13 -0.08 -0.07 1.00 -0.09
## price 0.46 0.54 0.74 -0.09 1.00
# Box plot of the response (price) by region.
# The formula + data interface labels the axes with the plain column names.
boxplot(price ~ Region, data = price_data, xlab = "Region", ylab = "price")
# Box plots of each quantitative predictor by region, on a 2 x 2 grid.
# The previous graphics settings are restored afterwards so the grid does not
# leak into later plots.
old_par <- par(mfrow = c(2, 2))
boxplot(beds ~ Region, data = price_data, xlab = "Region", ylab = "beds")
boxplot(baths ~ Region, data = price_data, xlab = "Region", ylab = "baths")
boxplot(size ~ Region, data = price_data, xlab = "Region", ylab = "size")
boxplot(lot_size ~ Region, data = price_data,
        xlab = "Region", ylab = "lot_size")
par(old_par)
# Scatter plots of each quantitative predictor against the response (price).
# The response is not plotted against itself: that panel is always a straight
# diagonal and carries no information. Four panels fit a 2 x 2 grid; the
# previous graphics settings are restored afterwards so the grid does not leak
# into the residual plots further down.
old_par <- par(mfrow = c(2, 2))
plot(price_data$beds, price_data$price, xlab = "beds", ylab = "price")
plot(price_data$baths, price_data$price, xlab = "baths", ylab = "price")
plot(price_data$size, price_data$price, xlab = "size", ylab = "price")
plot(price_data$lot_size, price_data$price, xlab = "lot_size", ylab = "price")
par(old_par)
# Model 1: linear regression of sale price on floor area (size) and number of
# bathrooms (baths).
model1 <- lm(formula = price ~ size + baths, data = price_data)
# Coefficient table, residual standard error, R-squared and overall F-test.
summary(object = model1)
##
## Call:
## lm(formula = price ~ size + baths, data = price_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1512769 -184967 -14583 139168 4682870
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54645.49 24309.14 2.248 0.0247 *
## size 412.82 12.88 32.043 < 2e-16 ***
## baths 75416.75 11711.01 6.440 1.56e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 377700 on 1662 degrees of freedom
## Multiple R-squared: 0.5643, Adjusted R-squared: 0.5637
## F-statistic: 1076 on 2 and 1662 DF, p-value: < 2.2e-16
# Residual diagnostics for model 1.
# rstandard() returns the standardized residuals. fitted() is the extractor
# for the fitted values; `model1$fitted` only worked through partial matching
# of the `fitted.values` component.
resids <- rstandard(model1)
fits <- fitted(model1)
# Constant-variance check: standardized residuals against fitted values, with
# a dashed horizontal reference line at zero.
plot(fits, resids,
     xlab = "Fitted Values",
     ylab = "Standardized Residuals",
     main = "")
abline(h = 0, lty = 2, lwd = 2)
# Model 2: linear regression of sale price on Region only. Region is a
# character column, so lm() treats it as categorical; the printed coefficients
# (RegionNorth, RegionSouth) are offsets from the baseline level, Central.
model2 <- lm(formula = price ~ Region, data = price_data)
summary(object = model2)
##
## Call:
## lm(formula = price ~ Region, data = price_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -976039 -305368 -122368 149942 5023961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1226039 26408 46.426 < 2e-16 ***
## RegionNorth -193671 33439 -5.792 8.31e-09 ***
## RegionSouth -425981 35779 -11.906 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 548900 on 1662 degrees of freedom
## Multiple R-squared: 0.07961, Adjusted R-squared: 0.0785
## F-statistic: 71.87 on 2 and 1662 DF, p-value: < 2.2e-16
# Residual diagnostics for model 2.
# rstandard() returns the standardized residuals. fitted() is the extractor
# for the fitted values; `model2$fitted` only worked through partial matching
# of the `fitted.values` component.
resids2 <- rstandard(model2)
fits2 <- fitted(model2)
# Constant-variance check: standardized residuals against fitted values, with
# a dashed horizontal reference line at zero.
plot(fits2, resids2,
     xlab = "Fitted Values",
     ylab = "Standardized Residuals",
     main = "")
abline(h = 0, lty = 2, lwd = 2)
# Model 3: linear regression of sale price on Region, floor area (size) and
# number of bathrooms (baths), i.e. model 1 plus the categorical Region term.
model3 <- lm(formula = price ~ Region + size + baths, data = price_data)
summary(object = model3)
##
## Call:
## lm(formula = price ~ Region + size + baths, data = price_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1478315 -174273 -25602 134080 4555225
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 237727.73 28270.85 8.409 < 2e-16 ***
## RegionNorth -127937.34 21863.00 -5.852 5.85e-09 ***
## RegionSouth -319851.40 23423.98 -13.655 < 2e-16 ***
## size 405.70 12.24 33.140 < 2e-16 ***
## baths 68780.22 11126.49 6.182 7.97e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 357700 on 1660 degrees of freedom
## Multiple R-squared: 0.6097, Adjusted R-squared: 0.6087
## F-statistic: 648.2 on 4 and 1660 DF, p-value: < 2.2e-16
# Residual diagnostics for model 3.
# rstandard() returns the standardized residuals. fitted() is the extractor
# for the fitted values; `model3$fitted` only worked through partial matching
# of the `fitted.values` component.
resids3 <- rstandard(model3)
fits3 <- fitted(model3)
# Constant-variance check: standardized residuals against fitted values, with
# a dashed horizontal reference line at zero.
plot(fits3, resids3,
     xlab = "Fitted Values",
     ylab = "Standardized Residuals",
     main = "")
abline(h = 0, lty = 2, lwd = 2)
# Report the adjusted R^2 of model 1 and model 3, rounded to 2 decimals.
# Adjusted (not multiple) R^2 is used, so the two models are compared with a
# penalty for the extra Region terms in model 3.
paste(
  "Model 1 adjusted R^2:",
  round(summary(model1)[["adj.r.squared"]], digits = 2)
)
## [1] "Model 1 adjusted R^2: 0.56"
paste(
  "Model 3 adjusted R^2:",
  round(summary(model3)[["adj.r.squared"]], digits = 2)
)
## [1] "Model 3 adjusted R^2: 0.61"
Region = "North", size = 704, baths = 2
Region = "Central", size = 1010, baths = 3
# New observations to predict for. The column names (Region, size, baths)
# match the predictors used in model3.
newvals1 <- data.frame(Region = "North", size = 704, baths = 2)
newvals2 <- data.frame(Region = "Central", size = 1010, baths = 3)
# 95% confidence intervals from model3 at each new observation
# (fit = point estimate, lwr/upr = interval bounds).
predict(model3, newdata = newvals1, interval = "confidence", level = 0.95)
## fit lwr upr
## 1 532963.6 497329.5 568597.8
predict(model3, newdata = newvals2, interval = "confidence", level = 0.95)
## fit lwr upr
## 1 853825.4 805391 902259.9
Comment on whether the plot shows constant variance.