Complete all Exercises, and submit answers to Questions on the Coursera platform.
library(dplyr)## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)## Warning: package 'ggplot2' was built under R version 3.3.3
library(ggmosaic)## Warning: package 'ggmosaic' was built under R version 3.3.3
## Loading required package: productplots
## Warning: package 'productplots' was built under R version 3.3.3
##
## Attaching package: 'ggmosaic'
## The following objects are masked from 'package:productplots':
##
## ddecker, hspine, mosaic, prodcalc, spine, vspine
This initial quiz will concern exploratory data analysis (EDA) of the Ames Housing dataset. EDA is essential when working with any source of data and helps inform modeling.
First, let us load the data:
load("ames_train.Rdata")Misc.Feature, Alley, Pool.QC
# type your code for Question 1 here, and Knit
## counting all missing values
missing_val <- ames_train %>% summarise_all(funs(sum(is.na(.))))## Warning: package 'bindrcpp' was built under R version 3.3.3
## sorting the top 3 missing values. Unlist helps converting to vector with variable names as names
head(sort(unlist(missing_val[1,]), decreasing = TRUE), 3)## Pool.QC Misc.Feature Alley
## 997 971 933
##int? Change them to factors when conducting your analysis.
# type your code for Question 2 here, and Knit
## Splitting the data variables by type numerical or categorical
ames_var_types <- split(names(ames_train), sapply(ames_train, function(x) paste(class(x), collapse = " ")))
ames_var_types$integer## [1] "PID" "area" "price"
## [4] "MS.SubClass" "Lot.Frontage" "Lot.Area"
## [7] "Overall.Qual" "Overall.Cond" "Year.Built"
## [10] "Year.Remod.Add" "Mas.Vnr.Area" "BsmtFin.SF.1"
## [13] "BsmtFin.SF.2" "Bsmt.Unf.SF" "Total.Bsmt.SF"
## [16] "X1st.Flr.SF" "X2nd.Flr.SF" "Low.Qual.Fin.SF"
## [19] "Bsmt.Full.Bath" "Bsmt.Half.Bath" "Full.Bath"
## [22] "Half.Bath" "Bedroom.AbvGr" "Kitchen.AbvGr"
## [25] "TotRms.AbvGrd" "Fireplaces" "Garage.Yr.Blt"
## [28] "Garage.Cars" "Garage.Area" "Wood.Deck.SF"
## [31] "Open.Porch.SF" "Enclosed.Porch" "X3Ssn.Porch"
## [34] "Screen.Porch" "Pool.Area" "Misc.Val"
## [37] "Mo.Sold" "Yr.Sold"
sapply(ames_var_types$integer, function(y) nrow(unique(select(ames_train, y))))## PID area price MS.SubClass
## 1000 686 535 15
## Lot.Frontage Lot.Area Overall.Qual Overall.Cond
## 105 785 10 9
## Year.Built Year.Remod.Add Mas.Vnr.Area BsmtFin.SF.1
## 102 61 259 529
## BsmtFin.SF.2 Bsmt.Unf.SF Total.Bsmt.SF X1st.Flr.SF
## 106 606 594 624
## X2nd.Flr.SF Low.Qual.Fin.SF Bsmt.Full.Bath Bsmt.Half.Bath
## 287 12 5 4
## Full.Bath Half.Bath Bedroom.AbvGr Kitchen.AbvGr
## 5 3 7 3
## TotRms.AbvGrd Fireplaces Garage.Yr.Blt Garage.Cars
## 12 5 94 7
## Garage.Area Wood.Deck.SF Open.Porch.SF Enclosed.Porch
## 371 215 176 101
## X3Ssn.Porch Screen.Porch Pool.Area Misc.Val
## 15 57 4 18
## Mo.Sold Yr.Sold
## 12 5
All variable related to the year were clasified as numerical variables but will be considered categorical for instance Year.Built & Year.Remod.Add, Garage.Yr.Blt and Yr.Sold. The first two variables Year.Built & Year.Remod.Add will most likely be merged if the remodel date is considered as the new life for the house.
StoneBr `
# type your code for Question 3 here, and Knit
ames_train %>% select(Neighborhood, price) %>% group_by(Neighborhood) %>% summarise(Avg_price = mean(price, na.rm=TRUE), std_dev = sd(price, na.rm=TRUE)) %>% arrange(desc(std_dev))## # A tibble: 27 x 3
## Neighborhood Avg_price std_dev
## <fctr> <dbl> <dbl>
## 1 StoneBr 339316.0 123459.10
## 2 NridgHt 333646.7 105088.90
## 3 Timber 265192.2 84029.57
## 4 Veenker 233650.0 72545.41
## 5 Crawfor 204196.6 71267.56
## 6 GrnHill 280000.0 70710.68
## 7 Somerst 234595.9 65199.49
## 8 Edwards 136322.0 54851.63
## 9 CollgCr 196950.9 52786.08
## 10 SawyerW 183101.0 48354.36
## # ... with 17 more rows
ggplot(ames_train, aes(x=Neighborhood, y=price, fill=Neighborhood))+geom_boxplot() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) Both summary statistics and graphical plots shows that StoneBr neighborhood has the highest standard deviation in terms of house price.
price?
Overall.Qual
# type your code for Question 4 here, and Knit
p <- ggplot(data = ames_train, aes(x=Lot.Area, y=price/100000))
p+geom_point()+ stat_smooth(method = "lm", col="red") +labs(title="Prices by Lot Area", x="Lot Area", y="Prices in 100K") + theme(plot.title = element_text(size = rel(2), color = "blue", hjust = 0.5, vjust = 0))ggplot(data = ames_train, aes(x=Bedroom.AbvGr, y=price/100000))+geom_point()+ stat_smooth(method = "lm", col="red") +labs(title="Prices by houses with Bedroom Above Grade", x="Bedroom Above Grade", y="Prices in 100K") + theme(plot.title = element_text(size = rel(2), color = "blue", hjust = 0.5, vjust = 0))ggplot(data = ames_train, aes(x=Overall.Qual, y=price/100000))+geom_point()+ stat_smooth(method = "lm", col="red") +labs(title="Prices by houses overall quality", x="Overall quality", y="Prices in 100K") + theme(plot.title = element_text(size = rel(2), color = "blue", hjust = 0.5, vjust = 0))ames_train %>% filter(Overall.Qual==9, price<200000)## # A tibble: 1 x 81
## PID area price MS.SubClass MS.Zoning Lot.Frontage Lot.Area
## <int> <int> <int> <int> <fctr> <int> <int>
## 1 533350090 2944 150000 60 RL NA 24572
## # ... with 74 more variables: Street <fctr>, Alley <fctr>,
## # Lot.Shape <fctr>, Land.Contour <fctr>, Utilities <fctr>,
## # Lot.Config <fctr>, Land.Slope <fctr>, Neighborhood <fctr>,
## # Condition.1 <fctr>, Condition.2 <fctr>, Bldg.Type <fctr>,
## # House.Style <fctr>, Overall.Qual <int>, Overall.Cond <int>,
## # Year.Built <int>, Year.Remod.Add <int>, Roof.Style <fctr>,
## # Roof.Matl <fctr>, Exterior.1st <fctr>, Exterior.2nd <fctr>,
## # Mas.Vnr.Type <fctr>, Mas.Vnr.Area <int>, Exter.Qual <fctr>,
## # Exter.Cond <fctr>, Foundation <fctr>, Bsmt.Qual <fctr>,
## # Bsmt.Cond <fctr>, Bsmt.Exposure <fctr>, BsmtFin.Type.1 <fctr>,
## # BsmtFin.SF.1 <int>, BsmtFin.Type.2 <fctr>, BsmtFin.SF.2 <int>,
## # Bsmt.Unf.SF <int>, Total.Bsmt.SF <int>, Heating <fctr>,
## # Heating.QC <fctr>, Central.Air <fctr>, Electrical <fctr>,
## # X1st.Flr.SF <int>, X2nd.Flr.SF <int>, Low.Qual.Fin.SF <int>,
## # Bsmt.Full.Bath <int>, Bsmt.Half.Bath <int>, Full.Bath <int>,
## # Half.Bath <int>, Bedroom.AbvGr <int>, Kitchen.AbvGr <int>,
## # Kitchen.Qual <fctr>, TotRms.AbvGrd <int>, Functional <fctr>,
## # Fireplaces <int>, Fireplace.Qu <fctr>, Garage.Type <fctr>,
## # Garage.Yr.Blt <int>, Garage.Finish <fctr>, Garage.Cars <int>,
## # Garage.Area <int>, Garage.Qual <fctr>, Garage.Cond <fctr>,
## # Paved.Drive <fctr>, Wood.Deck.SF <int>, Open.Porch.SF <int>,
## # Enclosed.Porch <int>, X3Ssn.Porch <int>, Screen.Porch <int>,
## # Pool.Area <int>, Pool.QC <fctr>, Fence <fctr>, Misc.Feature <fctr>,
## # Misc.Val <int>, Mo.Sold <int>, Yr.Sold <int>, Sale.Type <fctr>,
## # Sale.Condition <fctr>
ggplot(data = ames_train, aes(x=Year.Built, y=price/100000))+geom_point()+ stat_smooth(method = "lm", col="red") +labs(title="Prices by houses built year", x="Year built", y="Prices in 100K") + theme(plot.title = element_text(size = rel(2), color = "blue", hjust = 0.5, vjust = 0))Based on all the above plots, we house overall quality seems to be the best predictors compare to other variables mentioned in this section.
price and area. Which of the following variable transformations makes the relationship appear to be the most linear?
price and area
# type your code for Question 5 here, and Knit
par(mfrow=c(2,2))
p <- ggplot(data = ames_train, aes(x=Lot.Area, y=price/100000))
p+geom_point()+ stat_smooth(method = "lm", col="red") +labs(title="Prices by Lot Area", x="Lot Area", y="Prices in 100K") + theme(plot.title = element_text(size = rel(2), color = "blue", hjust = 0.5, vjust = 0))p <- ggplot(data = ames_train, aes(x=log(Lot.Area), y=price/100000))
p+geom_point()+ stat_smooth(method = "lm", col="red") +labs(title="Prices by Log Lot Area", x="Lot Area with Log transformation", y="Prices in 100K") + theme(plot.title = element_text(size = rel(2), color = "blue", hjust = 0.5, vjust = 0))p <- ggplot(data = ames_train, aes(x=log(Lot.Area), y=log(price/100000)))
p+geom_point()+ stat_smooth(method = "lm", col="red") +labs(title="Log Prices by Log Lot Area", x="Lot Area with Log transformation", y="Log Prices in 100K") + theme(plot.title = element_text(size = rel(2), color = "blue", hjust = 0.5, vjust = 0))p <- ggplot(data = ames_train, aes(x=Lot.Area, y=log(price/100000)))
p+geom_point()+ stat_smooth(method = "lm", col="red") +labs(title="Log Prices by Lot Area", x="Lot Area", y="Log Prices in 100K") + theme(plot.title = element_text(size = rel(2), color = "blue", hjust = 0.5, vjust = 0))# type your code for Question 6 here, and Knit
alpha <- ames_train %>% filter(!is.na(Garage.Type)) %>% nrow()
beta <- nrow(ames_train)-alpha
paste("Our posterior beta will be: beta(", alpha+9, "," ,beta+1,")", sep = "", collapse = " ")## [1] "Our posterior beta will be: beta(963,47)"
# type your code for Question 7 here, and Knit
## Percentage of housing price with buil year >1999
n <- nrow(ames_train)
ames_train %>% group_by(Year.Built>1999) %>% summarise(Yr_built=paste(round(100*n()/n, 2), "%"))## # A tibble: 2 x 2
## `Year.Built > 1999` Yr_built
## <lgl> <chr>
## 1 FALSE 72.8 %
## 2 TRUE 27.2 %
avg_price <- mean(ames_train$price, na.rm = TRUE)
med_price <- median(ames_train$price, na.rm = TRUE)
ames_train %>% ggplot(aes(x=price/10^5))+geom_histogram(colour="black", fill="green")+geom_vline(aes(xintercept = avg_price/10^5), color="red", linetype="dashed", size=1) + geom_vline(aes(xintercept = med_price/10^5), color="red", linetype="dashed", size=1) + annotate("text", x=c(avg_price/10^5, med_price/10^5), y=c(-4, -6), label=c("Mean", "Median"), colour=c("red", "blue"), size=3)+ labs(title = "Houses prices by hundred thousands histogram", x="Frequencies", y="prices in 100K")+theme(plot.title = element_text(size = rel(1), color = "blue", hjust = 0.5, vjust = 0))## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Counting the houses with no basement
ames_train %>% filter(is.na(Bsmt.Qual)) %>% nrow() %>% paste("houses without basement")## [1] "21 houses without basement"
## Counting the house located on gravel roads
ames_train %>% filter(grepl("Grvl", Street, ignore.case=TRUE)) %>% nrow() %>% paste("houses in gravel streets")## [1] "3 houses in gravel streets"
For the above question, we will proceed to an hypothesis testing with H0: Homes with garage have no larger footage than others => Avg_LotArea_with_grg - Avg_LotArea_without_grg = 0 HA: Homes with garage have larger footage than others => Avg_LotArea_with_grg - Avg_LotArea_without_grg > 0
# type your code for Question 8 here, and Knit
alpha = 0.05
##Mean and standard deviation of both populations
grg_stats <- ames_train %>% group_by(is.na(Garage.Finish)) %>% summarise(Counts =n(),Std_dev = sd(Lot.Area, na.rm=TRUE), Avg = mean(Lot.Area, na.rm=TRUE))
## Standard Error of the difference
##degree of freedom is min(n1-1, n2-1)
se_grg_diff <- sqrt(grg_stats$Std_dev[1]^2/(grg_stats$Counts[1]-1)+grg_stats$Std_dev[2]^2/(grg_stats$Counts[2]-1))
Z_score <- (grg_stats$Avg[1]-grg_stats$Avg[2])/se_grg_diff
p_value <- pt(Z_score, min(grg_stats$Counts[1]-1, grg_stats$Counts[2]-1) ,lower.tail = FALSE)
if(p_value < alpha){
paste("With a p-value of approximately", round(p_value, 4) ,"We reject the null hypothesis of no difference")
} else{
paste("With a p-value of approximately", round(p_value, 4) ,"We fail to reject the null hypothesis of no difference")
}## [1] "With a p-value of approximately 0 We reject the null hypothesis of no difference"
# type your code for Question 9 here, and Knit
Lamda <- 3
std_dev1 <- 1
K_prior <- (Lamda/std_dev1)^2
Theta_prior <- std_dev1^2/Lamda
bedrooms <- ames_train %>% filter(!is.na(Lot.Area) > 2000 & !is.na(Bedroom.AbvGr) & Bedroom.AbvGr !=0) %>% summarise(Counts=n(),Mean=mean(Bedroom.AbvGr, na.rm=TRUE), Total=sum(Bedroom.AbvGr, na.rm=TRUE))
Theta_post <- Theta_prior/(bedrooms$Counts[1]*Theta_prior+1)
K_post <- K_prior + bedrooms$Total[1]
Lamda_post <- K_post*Theta_post
std_dev1_post <- Theta_post*sqrt(K_post)
c(Lamda_post, std_dev1_post)## [1] 2.81218781 0.05300357
price) on \(\log\)(area), there are some outliers. Which of the following do the three most outlying points have in common?
# type your code for Question 10 here, and Knit
ggplot(data = ames_train, aes(x=log(area), y=log(price), colour=Year.Built<1930))+geom_point()+geom_smooth(method = "lm", level=0.95)price_area <- lm(formula = price ~ area, data = ames_train)
stats_price_area <- summary(price_area)price if used as a dependent variable in a linear regression?
price is right-skewed.
# type your code for Question 11 here, and Knit
par(mfrow=c(3,2))
hist(ames_train$price, breaks = 100)
hist(log(ames_train$price), breaks = 100)
#Linear model area by prices
area_price_lm <- lm(formula = area ~ price, data = ames_train)
area_price_lm_log <- lm(formula = area ~ log(price), data = ames_train)
plot(area_price_lm$residuals, area_price_lm$fitted.values)
plot(area_price_lm_log$residuals, area_price_lm_log$fitted.values)
qqnorm(area_price_lm$residuals)
qqline(area_price_lm$residuals)
qqnorm(area_price_lm_log$residuals)
qqline(area_price_lm_log$residuals) We can see from the two plots that the one without logarythmic transformation is right skewed and its axis is difficult to read due to scaling. While the one with the log transformation is nearly normal and is has a more readable axis. Hence log transformation is useful for scaling the variable transformed as well as normalizing skewed data while keeping the same properties as the original distribution.
Bldg.Type = 1Fam)
# type your code for Question 12 here, and Knit
N_bldg_type <- ames_train %>% group_by(Neighborhood) %>% filter(tolower(Bldg.Type)=="1fam") %>% select(Neighborhood, Bldg.Type)%>% summarise(Counts=n())
N_count <- ames_train %>% group_by(Neighborhood)%>% select(Neighborhood, Bldg.Type) %>% summarise(Counts=n())
Bldg_neigh <- ames_train %>% select(Neighborhood, Bldg.Type) %>% xtabs(formula = ~ .)
Bldg_neigh## Bldg.Type
## Neighborhood 1Fam 2fmCon Duplex Twnhs TwnhsE
## Blmngtn 2 0 0 0 9
## Blueste 0 0 0 2 1
## BrDale 0 0 0 9 1
## BrkSide 40 1 0 0 0
## ClearCr 13 0 0 0 0
## CollgCr 82 0 1 0 2
## Crawfor 25 1 1 0 2
## Edwards 48 1 5 1 5
## Gilbert 48 1 0 0 0
## Greens 0 0 0 1 3
## GrnHill 0 0 0 0 2
## IDOTRR 33 2 0 0 0
## Landmrk 0 0 0 0 0
## MeadowV 0 0 0 7 9
## Mitchel 35 1 4 1 3
## NAmes 143 1 10 0 1
## NoRidge 28 0 0 0 0
## NPkVill 0 0 0 3 1
## NridgHt 40 0 0 6 11
## NWAmes 40 0 1 0 0
## OldTown 58 11 2 0 0
## Sawyer 56 1 4 0 0
## SawyerW 36 0 6 0 4
## Somerst 47 0 0 8 19
## StoneBr 12 0 0 0 8
## SWISU 11 0 1 0 0
## Timber 19 0 0 0 0
## Veenker 7 0 0 0 3
ggplot(data = ames_train)+geom_mosaic(aes(weight=frequency(Bldg.Type), x=product(Neighborhood), fill=Bldg.Type), na.rm = TRUE)+theme(axis.text.x = element_text(angle = 90, hjust = 1))area) and the number of bedrooms above ground (Bedroom.AbvGr)?
# type your code for Question 13 here, and Knit
ggplot(data = ames_train, aes(y=log(area), x=Bedroom.AbvGr))+geom_point(aes(color=area, size=log(area)))+geom_smooth(method = "lm", level=0.95, col="red")# type your code for Question 14 here, and Knit
ames_train %>% filter(!is.na(Bsmt.Unf.SF) & Bsmt.Unf.SF!=0) %>% summarise(Count=n(), Means=mean(Bsmt.Unf.SF, na.rm=TRUE))## # A tibble: 1 x 2
## Count Means
## <int> <dbl>
## 1 918 595.2527