library(ggplot2)

#1 Read the full dataset that includes 38 variables and load libraries needed
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where the CSV exists at exactly this location.
tract_covariates <- read.csv("D:/Documents/RFiles/Datasets/tract_covariates.csv")

#2 Create a new dataframe with only the following variables: "czname", "hhinc_mean2000", "popdensity2000"
#3 Following step 2, keep only the rows for San Antonio (i.e., czname == "San Antonio")
# `%in%` is used instead of `==` for the row filter: `==` returns NA when
# czname is missing, and indexing rows with NA inserts all-NA rows into the
# result, whereas `%in%` returns FALSE for missing values.
df <- tract_covariates[
  tract_covariates$czname %in% "San Antonio",
  c("czname", "hhinc_mean2000", "popdensity2000")
]


#4 Make the histogram of household income for San Antonio (variable name: hhinc_mean2000)

# `bins = 30` spells out the bin count that `stat_bin()` otherwise falls back
# to, so the histogram is unchanged but ggplot2 no longer prints its
# "Pick better value with `binwidth`" message on every run.
# A recorded run also warned that 2 rows with non-finite hhinc_mean2000 were
# removed by `stat_bin()`.
ggplot(df, aes(x = hhinc_mean2000, fill = czname)) +
  geom_histogram(bins = 30) +
  labs(x = "Household Income", fill = "City")

#5 Boxplot of population density for San Antonio (variable name: popdensity2000)

# czname is mapped to both the x axis and the fill, so each czname value
# present in df gets its own filled box.
ggplot(
  data = df,
  mapping = aes(x = czname, y = popdensity2000, fill = czname)
) +
  geom_boxplot() +
  xlab("City") +
  ylab("Population Density") +
  labs(fill = "City")

#6 Probability density function (PDF) of household income for San Antonio (variable name: hhinc_mean2000)

# Density curve of hhinc_mean2000, with the line colour keyed to czname.
# A recorded run warned that 2 rows with non-finite values were removed by
# `stat_density()`.
ggplot(
  data = df,
  mapping = aes(x = hhinc_mean2000, color = czname)
) +
  geom_density() +
  xlab("Household Income") +
  labs(color = "City")

#7 Make the cumulative distribution function (CDF) of household income for San Antonio (variable name: hhinc_mean2000)
#9 Following Q7, add labels for X axis and Y axis

# `stat_ecdf()` draws the empirical cumulative distribution, so the y axis is
# a cumulative proportion running from 0 to 1 rather than a density; the axis
# label says so.
# A recorded run warned that 2 rows with non-finite values were removed by
# `stat_ecdf()`.
ggplot(df, aes(x = hhinc_mean2000, color = czname)) +
  stat_ecdf(geom = "step") +
  labs(x = "Household Income", y = "Cumulative Probability", color = "City")

#8 Scatter plot of population density (variable name: popdensity2000) against household income (variable name: hhinc_mean2000) for San Antonio

# Household income on the x axis, population density on the y axis, one point
# per row of df.
# A recorded run warned that 2 rows with missing or out-of-range values were
# removed by `geom_point()`.
ggplot(
  data = df,
  mapping = aes(x = hhinc_mean2000, y = popdensity2000)
) +
  geom_point() +
  xlab("Household Income") +
  ylab("Population density")

#10 Make the scatter plot in Q8 interactive using library: plotly
library(plotly)
## Attaching plotly masks ggplot2::last_plot, stats::filter, and graphics::layout.
# Build the static scatter plot first and keep it in `p`, then pass it to
# `ggplotly()` to render the interactive version.
p <- ggplot(
  data = df,
  mapping = aes(x = hhinc_mean2000, y = popdensity2000)
) +
  geom_point() +
  xlab("Household Income") +
  ylab("Population density")
ggplotly(p)