1) Before doing any graphing at all, extract a subset of the data by running this code. Based on your understanding of R, briefly explain what this code accomplishes. (10 Points)

# Read the Excel workbook into `df`, then keep only the rows whose `type`
# column equals "house"; the plots below all work from `df_cleaned`.
df <- read_excel("W16604-XLS-ENG.xlsx")
df_cleaned <- filter(df, type == "house")

2.a) Create a ggplot2 histogram of house sub-types. Approximately what proportion of houses are detached?

# Convert subtype to a factor so it is treated as a categorical variable
# (one bar and one fill colour per level).
df_cleaned$subtype <- as.factor(df_cleaned$subtype)

# Bar chart of the number of listings per house sub-type.
# geom_bar() counts rows per discrete x value by default. It draws the same
# bars as geom_histogram(stat = "count") but does not pass the binning
# parameters (binwidth, bins, pad) that the count stat ignores with a warning.
ggplot(df_cleaned, aes(x = subtype, fill = subtype)) +
  geom_bar() +
  labs(
    title = "Histogram of House Sub-types",
    tag = "2.a)",
    x = "House Subtype",
    y = "Numbers of each type",
    caption = "*Source from '62 Listing Properties' of HomeZilla in W16604-XLS-ENG.xlsx, HBR"
  ) +
  theme(
    plot.title = element_text(size = 20, face = "bold"),
    axis.title.x = element_text(size = 12, face = "bold"),
    axis.title.y = element_text(size = 12, face = "bold")
  )

# Computation of the proportion of detached houses
#AvgDetHouse = mean(df_cleaned$subtype == 'Single Family Detached')
#AvgDetHouse
- Based on the histogram shown, more than 50% of the houses are detached.

2.b) Create a ggplot2 histogram of bathrooms with different colours for house sub-types.

# Convert bathrooms to a factor so each distinct bathroom count becomes its
# own category on the x axis.
# NOTE(review): this overwrites the column in df_cleaned; any later step that
# needs bathrooms as a number would have to convert it back -- confirm.
df_cleaned$bathrooms <- as.factor(df_cleaned$bathrooms)

# Bar chart of listings per bathroom count, with one bar per house sub-type
# placed side by side. position_dodge2(preserve = "single") keeps every bar
# the same width even when a sub-type is missing for a given bathroom count.
# geom_bar() replaces geom_histogram(stat = "count"): same bars, but without
# the "Ignoring unknown parameters: binwidth, bins, pad" warning.
df_cleaned %>%
  ggplot(aes(x = bathrooms, fill = subtype)) +
  geom_bar(position = position_dodge2(preserve = "single")) +
  labs(
    title = "Histogram of Bathrooms",
    tag = "2.b)",
    x = "Bathroom",
    y = "Numbers of each type",
    caption = "*Source from '62 Listing Properties' of HomeZilla in W16604-XLS-ENG.xlsx, HBR"
  ) +
  theme(
    plot.title = element_text(size = 20, face = "bold"),
    axis.title.x = element_text(size = 12, face = "bold"),
    axis.title.y = element_text(size = 12, face = "bold")
  )

2.c) Create a ggplot2 scatter-plot of how price varies with the area of the house. Add a line of best fit to this plot.

# Scatter plot of price against sqfoot, overlaid with a linear-model fit
# (dashed red line) and its confidence band.
df_cleaned %>%
  ggplot(aes(x = sqfoot, y = price)) +
  geom_point(shape = 1) + # shape 1 = hollow circle
  stat_smooth(method = "lm", se = TRUE, colour = "red", linetype = 2) +
  labs(
    title = "Price ~ Area Relationship",
    tag = "2.c)",
    x = "Area",
    y = "Price",
    caption = "*Source from '62 Listing Properties' of HomeZilla in W16604-XLS-ENG.xlsx, HBR"
  ) +
  theme(
    plot.title = element_text(size = 20, face = "bold"),
    axis.title.x = element_text(size = 12, face = "bold"),
    axis.title.y = element_text(size = 12, face = "bold")
  )