Complete all Exercises, and submit answers to Questions on the Coursera platform.
This initial quiz will concern exploratory data analysis (EDA) of the Ames Housing dataset. EDA is essential when working with any source of data and helps inform modeling.
First, let us load the data:
load("~/ A statistical with R /Capston /EDA week1 /ames_train.Rdata")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(statsr)
library(ggplot2)
library(ggmosaic)
## Loading required package: productplots
##
## Attaching package: 'ggmosaic'
## The following objects are masked from 'package:productplots':
##
## ddecker, hspine, mosaic, prodcalc, spine, vspine
Misc.Feature, Fence, Pool.QC
Misc.Feature, Alley, Pool.QC
Pool.QC, Alley, Fence
Fireplace.Qu, Pool.QC, Lot.Frontage
missing_val <- ames_train %>% summarise_all(funs(sum(is.na(.))))
head(sort(unlist(missing_val[1,]), decreasing = TRUE), 3)
## Pool.QC Misc.Feature Alley
## 997 971 933
int? Change them to factors when conducting your analysis.
ames_var_types <- split(names(ames_train), sapply(ames_train, function(x) paste(class(x), collapse = " ")))
sapply(ames_var_types$integer, function(y) nrow(unique(select(ames_train, y))))
## PID area price MS.SubClass
## 1000 686 535 15
## Lot.Frontage Lot.Area Overall.Qual Overall.Cond
## 105 785 10 9
## Year.Built Year.Remod.Add Mas.Vnr.Area BsmtFin.SF.1
## 102 61 259 529
## BsmtFin.SF.2 Bsmt.Unf.SF Total.Bsmt.SF X1st.Flr.SF
## 106 606 594 624
## X2nd.Flr.SF Low.Qual.Fin.SF Bsmt.Full.Bath Bsmt.Half.Bath
## 287 12 5 4
## Full.Bath Half.Bath Bedroom.AbvGr Kitchen.AbvGr
## 5 3 7 3
## TotRms.AbvGrd Fireplaces Garage.Yr.Blt Garage.Cars
## 12 5 94 7
## Garage.Area Wood.Deck.SF Open.Porch.SF Enclosed.Porch
## 371 215 176 101
## X3Ssn.Porch Screen.Porch Pool.Area Misc.Val
## 15 57 4 18
## Mo.Sold Yr.Sold
## 12 5
StoneBr
Timber
Veenker
NridgHt
ames_train %>% select(Neighborhood, price) %>% group_by(Neighborhood) %>% summarise(Avg_price = mean(price, na.rm=TRUE), std_dev = sd(price, na.rm=TRUE)) %>% arrange(desc(std_dev))
ggplot(ames_train, aes(x=Neighborhood, y=price, fill=Neighborhood))+geom_boxplot() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
price?
Lot.Area
Bedroom.AbvGr
Overall.Qual
Year.Built
p <- ggplot(data = ames_train, aes(x=Lot.Area, y=price/100000))
p+geom_point()+ stat_smooth(method = "lm", col="red") +labs(title="Prices by Lot Area", x="Lot Area", y="Prices in 100K") + theme(plot.title = element_text(size = rel(2), color = "blue", hjust = 0.5, vjust = 0))
ggplot(data = ames_train, aes(x=Bedroom.AbvGr, y=price/100000))+geom_point()+ stat_smooth(method = "lm", col="red") +labs(title="Prices by houses with Bedroom Above Grade", x="Bedroom Above Grade", y="Prices in 100K") + theme(plot.title = element_text(size = rel(2), color = "blue", hjust = 0.5, vjust = 0))
ggplot(data = ames_train, aes(x=Overall.Qual, y=price/100000))+geom_point()+ stat_smooth(method = "lm", col="red") +labs(title="Prices by houses overall quality", x="Overall quality", y="Prices in 100K") + theme(plot.title = element_text(size = rel(2), color = "blue", hjust = 0.5, vjust = 0))
ames_train %>% filter(Overall.Qual==9, price<200000)
ggplot(data = ames_train, aes(x=Year.Built, y=price/100000))+geom_point()+ stat_smooth(method = "lm", col="red") +labs(title="Prices by houses built year", x="Year built", y="Prices in 100K") + theme(plot.title = element_text(size = rel(2), color = "blue", hjust = 0.5, vjust = 0))
Based on all the above plots, we house overall quality seems to be the best predictors compare to other variables mentioned in this section.
price and area. Which of the following variable transformations makes the relationship appear to be the most linear?
price or area
price but not area
area but not price
price and area
par(mfrow=c(2,2))
p <- ggplot(data = ames_train, aes(x=Lot.Area, y=price/100000))
p+geom_point()+ stat_smooth(method = "lm", col="red") +labs(title="Prices by Lot Area", x="Lot Area", y="Prices in 100K") + theme(plot.title = element_text(size = rel(2), color = "blue", hjust = 0.5, vjust = 0))
p <- ggplot(data = ames_train, aes(x=log(Lot.Area), y=price/100000))
p+geom_point()+ stat_smooth(method = "lm", col="red") +labs(title="Prices by Log Lot Area", x="Lot Area with Log transformation", y="Prices in 100K") + theme(plot.title = element_text(size = rel(2), color = "blue", hjust = 0.5, vjust = 0))
p <- ggplot(data = ames_train, aes(x=log(Lot.Area), y=log(price/100000)))
p+geom_point()+ stat_smooth(method = "lm", col="red") +labs(title="Log Prices by Log Lot Area", x="Lot Area with Log transformation", y="Log Prices in 100K") + theme(plot.title = element_text(size = rel(2), color = "blue", hjust = 0.5, vjust = 0))
p <- ggplot(data = ames_train, aes(x=Lot.Area, y=log(price/100000)))
p+geom_point()+ stat_smooth(method = "lm", col="red") +labs(title="Log Prices by Lot Area", x="Lot Area", y="Log Prices in 100K") + theme(plot.title = element_text(size = rel(2), color = "blue", hjust = 0.5, vjust = 0))
alpha <- ames_train %>% filter(!is.na(Garage.Type)) %>% nrow()
beta <- nrow(ames_train)-alpha
paste("Our posterior beta will be: beta(", alpha+9, "," ,beta+1,")", sep = "", collapse = " ")
## [1] "Our posterior beta will be: beta(963,47)"
n <- nrow(ames_train)
ames_train %>% group_by(Year.Built>1999) %>% summarise(Yr_built=paste(round(100*n()/n, 2), "%"))
avg_price <- mean(ames_train$price, na.rm = TRUE)
med_price <- median(ames_train$price, na.rm = TRUE)
ames_train %>% ggplot(aes(x=price/10^5))+geom_histogram(colour="black", fill="green")+geom_vline(aes(xintercept = avg_price/10^5), color="red", linetype="dashed", size=1) + geom_vline(aes(xintercept = med_price/10^5), color="red", linetype="dashed", size=1) + annotate("text", x=c(avg_price/10^5, med_price/10^5), y=c(-4, -6), label=c("Mean", "Median"), colour=c("red", "blue"), size=3)+ labs(title = "Houses prices by hundred thousands histogram", x="Frequencies", y="prices in 100K")+theme(plot.title = element_text(size = rel(1), color = "blue", hjust = 0.5, vjust = 0))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ames_train %>% filter(is.na(Bsmt.Qual)) %>% nrow() %>% paste("houses without basement")
## [1] "21 houses without basement"
ames_train %>% filter(grepl("Grvl", Street, ignore.case=TRUE)) %>% nrow() %>% paste("houses in gravel streets")
## [1] "3 houses in gravel streets"
For the above question, we will proceed to an hypothesis testing with H0: Homes with garage have no larger footage than others => Avg_LotArea_with_grg - Avg_LotArea_without_grg = 0 HA: Homes with garage have larger footage than others => Avg_LotArea_with_grg - Avg_LotArea_without_grg > 0
alpha = 0.05
##Mean and standard deviation of both populations
grg_stats <- ames_train %>% group_by(is.na(Garage.Finish)) %>% summarise(Counts =n(),Std_dev = sd(Lot.Area, na.rm=TRUE), Avg = mean(Lot.Area, na.rm=TRUE))
## Standard Error of the difference
##degree of freedom is min(n1-1, n2-1)
se_grg_diff <- sqrt(grg_stats$Std_dev[1]^2/(grg_stats$Counts[1]-1)+grg_stats$Std_dev[2]^2/(grg_stats$Counts[2]-1))
Z_score <- (grg_stats$Avg[1]-grg_stats$Avg[2])/se_grg_diff
p_value <- pt(Z_score, min(grg_stats$Counts[1]-1, grg_stats$Counts[2]-1) ,lower.tail = FALSE)
if(p_value < alpha){
paste("With a p-value of approximately", round(p_value, 4) ,"We reject the null hypothesis of no difference")
} else{
paste("With a p-value of approximately", round(p_value, 4) ,"We fail to reject the null hypothesis of no difference")
}
## [1] "With a p-value of approximately 0 We reject the null hypothesis of no difference"
Lamda <- 3
std_dev1 <- 1
K_prior <- (Lamda/std_dev1)^2
Theta_prior <- std_dev1^2/Lamda
bedrooms <- ames_train %>% filter(!is.na(Lot.Area) > 2000 & !is.na(Bedroom.AbvGr) & Bedroom.AbvGr !=0) %>% summarise(Counts=n(),Mean=mean(Bedroom.AbvGr, na.rm=TRUE), Total=sum(Bedroom.AbvGr, na.rm=TRUE))
Theta_post <- Theta_prior/(bedrooms$Counts[1]*Theta_prior+1)
K_post <- K_prior + bedrooms$Total[1]
Lamda_post <- K_post*Theta_post
std_dev1_post <- Theta_post*sqrt(K_post)
c(Lamda_post, std_dev1_post)
## [1] 2.81218781 0.05300357
price) on \(\log\)(area), there are some outliers. Which of the following do the three most outlying points have in common?
ggplot(data = ames_train, aes(x=log(area), y=log(price), colour=Year.Built<1930))+geom_point()+geom_smooth(method = "lm", level=0.95)
price_area <- lm(formula = price ~ area, data = ames_train)
stats_price_area <- summary(price_area)
price if used as a dependent variable in a linear regression?
price is right-skewed.
price cannot take on negative values.
price can only take on integer values.par(mfrow=c(3,2))
hist(ames_train$price, breaks = 100)
hist(log(ames_train$price), breaks = 100)
#Linear model area by prices
area_price_lm <- lm(formula = area ~ price, data = ames_train)
area_price_lm_log <- lm(formula = area ~ log(price), data = ames_train)
plot(area_price_lm$residuals, area_price_lm$fitted.values)
plot(area_price_lm_log$residuals, area_price_lm_log$fitted.values)
qqnorm(area_price_lm$residuals)
qqline(area_price_lm$residuals)
qqnorm(area_price_lm_log$residuals)
qqline(area_price_lm_log$residuals)
Bldg.Type = 1Fam)
N_bldg_type <- ames_train %>% group_by(Neighborhood) %>% filter(tolower(Bldg.Type)=="1fam") %>% select(Neighborhood, Bldg.Type)%>% summarise(Counts=n())
N_count <- ames_train %>% group_by(Neighborhood)%>% select(Neighborhood, Bldg.Type) %>% summarise(Counts=n())
Bldg_neigh <- ames_train %>% select(Neighborhood, Bldg.Type) %>% xtabs(formula = ~ .)
Bldg_neigh
## Bldg.Type
## Neighborhood 1Fam 2fmCon Duplex Twnhs TwnhsE
## Blmngtn 2 0 0 0 9
## Blueste 0 0 0 2 1
## BrDale 0 0 0 9 1
## BrkSide 40 1 0 0 0
## ClearCr 13 0 0 0 0
## CollgCr 82 0 1 0 2
## Crawfor 25 1 1 0 2
## Edwards 48 1 5 1 5
## Gilbert 48 1 0 0 0
## Greens 0 0 0 1 3
## GrnHill 0 0 0 0 2
## IDOTRR 33 2 0 0 0
## Landmrk 0 0 0 0 0
## MeadowV 0 0 0 7 9
## Mitchel 35 1 4 1 3
## NAmes 143 1 10 0 1
## NoRidge 28 0 0 0 0
## NPkVill 0 0 0 3 1
## NridgHt 40 0 0 6 11
## NWAmes 40 0 1 0 0
## OldTown 58 11 2 0 0
## Sawyer 56 1 4 0 0
## SawyerW 36 0 6 0 4
## Somerst 47 0 0 8 19
## StoneBr 12 0 0 0 8
## SWISU 11 0 1 0 0
## Timber 19 0 0 0 0
## Veenker 7 0 0 0 3
ggplot(data = ames_train)+geom_mosaic(aes(weight=frequency(Bldg.Type), x=product(Neighborhood), fill=Bldg.Type), na.rm = TRUE)+theme(axis.text.x = element_text(angle = 90, hjust = 1))
area) and the number of bedrooms above ground (Bedroom.AbvGr)?
ggplot(data = ames_train, aes(y=log(area), x=Bedroom.AbvGr))+geom_point(aes(color=area, size=log(area)))+geom_smooth(method = "lm", level=0.95, col="red")
ames_train %>% filter(!is.na(Bsmt.Unf.SF) & Bsmt.Unf.SF!=0) %>% summarise(Count=n(), Means=mean(Bsmt.Unf.SF, na.rm=TRUE))