Complete all Exercises, and submit answers to Questions on the Coursera platform.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
This initial quiz will concern exploratory data analysis (EDA) of the Ames Housing dataset. EDA is essential when working with any source of data and helps inform modeling.
First, let us load the data:
# Restore the saved workspace image; the code below expects it to provide
# the `ames_train` data frame.
load(file = "ames_train.Rdata")
Misc.Feature, Fence, Pool.QC
Misc.Feature, Alley, Pool.QC
Pool.QC, Alley, Fence
Fireplace.Qu, Pool.QC, Lot.Frontage
# Question 1: count the missing values in every column of ames_train and
# show the three columns with the most.
na_count <- ames_train %>%
  is.na() %>%
  colSums()
na_count %>%
  sort(decreasing = TRUE) %>%
  head(n = 3)
## Pool.QC Misc.Feature Alley
## 997 971 933
int? Change them to factors when conducting your analysis.
# Question 2: recode the two overall rating columns as ordered factors.
ames_train <- ames_train %>%
  mutate(
    Overall.Qual = ordered(Overall.Qual),
    Overall.Cond = ordered(Overall.Cond)
  )
StoneBr
Timber
Veenker
NridgHt
# Question 3: standard deviation of price within each neighborhood,
# listed from most to least variable.
ames_train %>%
  group_by(Neighborhood) %>%
  summarise(sd = sd(price)) %>%
  arrange(desc(sd))
## # A tibble: 27 x 2
## Neighborhood sd
## <fctr> <dbl>
## 1 StoneBr 123459
## 2 NridgHt 105089
## 3 Timber 84030
## 4 Veenker 72545
## 5 Crawfor 71268
## 6 GrnHill 70711
## 7 Somerst 65199
## 8 Edwards 54852
## 9 CollgCr 52786
## 10 SawyerW 48354
## # ... with 17 more rows
price?
Lot.Area
Bedroom.AbvGr
Overall.Qual
Year.Built
# Question 4: plot price against each candidate predictor with a
# least-squares line overlaid, and arrange the four panels in a 2 x 2 grid.
p1 <- ggplot(ames_train, aes(x = Lot.Area, y = price)) +
  geom_point() +
  geom_smooth(method = "lm")
p2 <- ggplot(ames_train, aes(x = Bedroom.AbvGr, y = price)) +
  geom_jitter() +
  geom_smooth(method = "lm")
# Overall.Qual is converted to an ordered factor earlier in this file. With a
# discrete x, ggplot2 puts every level in its own group, so the smoother sees
# a single x value per group and cannot draw a trend line. `group = 1` pools
# all levels into one fit so this panel is comparable with the other three.
p3 <- ggplot(ames_train, aes(x = Overall.Qual, y = price)) +
  geom_jitter() +
  geom_smooth(aes(group = 1), method = "lm")
p4 <- ggplot(ames_train, aes(x = Year.Built, y = price)) +
  geom_point() +
  geom_smooth(method = "lm")
grid.arrange(p1, p2, p3, p4, ncol = 2)
price and area. Which of the following variable transformations makes the relationship appear to be the most linear?
price or area
price but not area
area but not price
price and area
# Question 5: price versus area under the four combinations of log
# transformation (neither, price only, area only, both), each with a
# least-squares line, shown together in a 2 x 2 grid.
p1 <- ggplot(data = ames_train, mapping = aes(x = area, y = price)) +
  geom_point() +
  geom_smooth(method = "lm")
p2 <- ggplot(data = ames_train, mapping = aes(x = area, y = log(price))) +
  geom_jitter() +
  geom_smooth(method = "lm")
p3 <- ggplot(data = ames_train, mapping = aes(x = log(area), y = price)) +
  geom_jitter() +
  geom_smooth(method = "lm")
p4 <- ggplot(data = ames_train, mapping = aes(x = log(area), y = log(price))) +
  geom_point() +
  geom_smooth(method = "lm")
grid.arrange(grobs = list(p1, p2, p3, p4), ncol = 2)
# Question 6: posterior for the proportion of homes with at least one garage.
# x = homes with Garage.Cars >= 1; n = homes with a non-missing Garage.Cars.
# Both are kept as plain scalars (not 1x1 tibbles) so the arithmetic below is
# ordinary numeric arithmetic. Missing values are excluded from both counts.
# The constants 9 and 1 are the prior's two shape parameters (presumably a
# Beta(9, 1) prior -- confirm against the question text).
garage_cars <- ames_train$Garage.Cars
x <- sum(garage_cars >= 1, na.rm = TRUE)
n <- sum(!is.na(garage_cars))
print(paste("Beta(", 9 + x, ", ", 1 + n - x, ")"))
## [1] "Beta( 962 , 47 )"
# Question 7: several one-line summaries of ames_train.

# Homes with Year.Built after 1999.
ames_train %>%
  summarise(yr_built_gt99 = sum(Year.Built > 1999, na.rm = TRUE))
## # A tibble: 1 x 1
## yr_built_gt99
## <int>
## 1 272

# Homes with a recorded Year.Built.
ames_train %>%
  summarise(yr_built_any = sum(!is.na(Year.Built)))
## # A tibble: 1 x 1
## yr_built_any
## <int>
## 1 1000

# Median and mean sale price.
summarise(ames_train, median = median(price), mean = mean(price))
## # A tibble: 1 x 2
## median mean
## <dbl> <dbl>
## 1 159467 181190

# Homes whose total basement area is zero.
ames_train %>%
  summarise(no_basement = sum(Total.Bsmt.SF == 0, na.rm = TRUE))
## # A tibble: 1 x 1
## no_basement
## <int>
## 1 21

# Number of homes per street type.
ames_train %>%
  count(Street)
## # A tibble: 2 x 2
## Street n
## <fctr> <int>
## 1 Grvl 3
## 2 Pave 997
# Question 8: compare living area between homes with and without a garage.
# Has.Garage is a factor with level "1" where Garage.Area > 0 and "0"
# otherwise; a missing Garage.Area stays NA and is dropped by t.test().
ames_train$Has.Garage <- factor(as.integer(ames_train$Garage.Area > 0))
# Two-sided Welch t-test (t.test()'s default for the formula interface).
t.test(area ~ Has.Garage, data = ames_train)
##
## Welch Two Sample t-test
##
## data: area by Has.Garage
## t = -5.134, df = 50.702, p-value = 4.535e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -482.9963 -211.4183
## sample estimates:
## mean in group 0 mean in group 1
## 1145.043 1492.251
# Question 9: posterior mean and SD for the number of above-ground bedrooms
# in homes with area > 2000.
# Prior is lambda = 3 and sigma_sq = 1, so b = 3 / 1 = 3 and a = 3 * 3 = 9.
# We observe sum_x = 501 above-ground bedrooms across n = 138 homes, giving
# posterior parameters a_star = a + sum_x = 9 + 501 = 510 and
# b_star = b + n = 3 + 138 = 141.
# (The update a + sum(x), b + n matches the conjugate update for a Gamma prior
# on a Poisson rate -- presumably the intended model.)
lambda <- 3
sigma_sq <- 1
b <- lambda / sigma_sq
a <- lambda * b

# Filter once and keep the observed totals as plain scalars rather than
# 1x1 tibbles, so the arithmetic below is ordinary numeric arithmetic.
large_homes <- ames_train %>% filter(area > 2000)
sum_x <- sum(large_homes$Bedroom.AbvGr)
n <- nrow(large_homes)

a_star <- a + sum_x
b_star <- b + n
lambda_star <- a_star / b_star
sigma_star <- sqrt(a_star / b_star^2)
print(paste("Mean: ", lambda_star, ", SD: ", sigma_star))
## [1] "Mean: 3.61702127659574 , SD: 0.160164394193421"
\(\log\)(price) on \(\log\)(area), there are some outliers. Which of the following do the three most outlying points have in common?
# Question 10: regress log(price) on log(area), show the standard lm
# diagnostic plots, and list the three observations with the largest
# absolute standardized residuals.
fit <- lm(log(price) ~ log(area), data = ames_train)

# Draw the diagnostics in a 2 x 2 grid, then restore whatever layout was
# active before instead of forcing a 1 x 1 panel.
old_par <- par(mfrow = c(2, 2))
plot(fit)
par(old_par)

# Standardized residuals (MASS::stdres), computed once and reused.
std_resid <- stdres(fit)
ames_train$stdres <- std_resid
ames_train$stdres_abs <- abs(std_resid)
# NOTE: despite the `_gt2` suffix, the cutoff applied here is 3. The column
# name is kept unchanged in case other code refers to it.
ames_train$stdres_gt2 <- ames_train$stdres_abs > 3

# which() drops any NA flags, so only rows flagged TRUE are kept.
ames_train[
  which(ames_train$stdres_gt2),
  c("Bedroom.AbvGr", "Overall.Qual", "Year.Built", "Sale.Condition",
    "stdres_abs")
] %>%
  arrange(desc(stdres_abs)) %>%
  head(n = 3)
## # A tibble: 3 x 5
## Bedroom.AbvGr Overall.Qual Year.Built Sale.Condition stdres_abs
## <int> <ord> <int> <fctr> <dbl>
## 1 2 2 1923 Abnorml 7.37
## 2 3 4 1920 Normal 4.83
## 3 3 4 1910 Abnorml 4.43
price if used as a dependent variable in a linear regression?
price is right-skewed.
price cannot take on negative values.
price can only take on integer values.
# type your code for Question 11 here, and Knit
# Histogram of sale price, using ggplot2's default binning.
ggplot(data = ames_train, mapping = aes(x = price)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
(Bldg.Type = 1Fam)
# Question 12: for each neighborhood, compute the share of homes whose
# Bldg.Type is "1Fam", and keep the neighborhoods where that share is 1.
ames_train %>%
  group_by(Neighborhood) %>%
  summarise(mean.Bldg.Type = mean(Bldg.Type == "1Fam")) %>%
  filter(mean.Bldg.Type == 1)
## # A tibble: 3 x 2
## Neighborhood mean.Bldg.Type
## <fctr> <dbl>
## 1 ClearCr 1.00
## 2 NoRidge 1.00
## 3 Timber 1.00
(area) and the number of bedrooms above ground (Bedroom.AbvGr)?
# Question 13: jittered scatter of log(area) against the number of
# above-ground bedrooms.
ggplot(data = ames_train, mapping = aes(x = Bedroom.AbvGr, y = log(area))) +
  geom_jitter()
# Question 14: mean unfinished basement area among homes where that area is
# recorded and greater than zero.
ames_train %>%
  filter(!is.na(Bsmt.Unf.SF) & Bsmt.Unf.SF > 0) %>%
  summarise(mean = mean(Bsmt.Unf.SF))
## # A tibble: 1 x 1
## mean
## <dbl>
## 1 595