library(ggplot2)
#1 Load the libraries needed (above) and read the full dataset, which
#  includes 38 variables.
tract_covariates <- read.csv("D:/Documents/RFiles/Datasets/tract_covariates.csv")

#2 Keep only the variables "czname", "hhinc_mean2000" and "popdensity2000".
#3 Keep only the rows for San Antonio (i.e., czname is "San Antonio").
# `%in%` is used instead of `==` because `==` returns NA for a missing
# `czname`, and an NA in a row index adds an all-NA row to the result.
# `%in%` always returns TRUE/FALSE, so only genuine San Antonio rows are kept.
df <- tract_covariates[
  tract_covariates$czname %in% "San Antonio",
  c("czname", "hhinc_mean2000", "popdensity2000")
]
#4 Make the histogram of household income for San Antonio
#  (variable name: hhinc_mean2000).
# `bins` is set explicitly: 30 is the value `stat_bin()` falls back to, so the
# plot is unchanged, but ggplot2 no longer prints its
# "Pick better value with `binwidth`" message.
ggplot(df, aes(x = hhinc_mean2000, fill = czname)) +
  geom_histogram(bins = 30) +
  labs(x = "Household Income", fill = "City")
# Console output from the original run: ggplot2 warned that 2 rows containing
# non-finite values were removed (`stat_bin()`).

#5 Boxplot of population density for San Antonio
#  (variable name: popdensity2000), one box per commuting-zone name.
ggplot(
  data = df,
  mapping = aes(x = czname, y = popdensity2000, fill = czname)
) +
  geom_boxplot() +
  xlab("City") +
  ylab("Population Density") +
  labs(fill = "City")

#6 Probability density function (PDF) of household income for San Antonio
#  (variable name: hhinc_mean2000).
ggplot(data = df, mapping = aes(x = hhinc_mean2000, color = czname)) +
  geom_density() +
  xlab("Household Income") +
  labs(color = "City")
# Console output from the original run: ggplot2 warned that 2 rows containing
# non-finite values were removed (`stat_density()`).

#7 Make the cumulative distribution function (CDF) of household income for
#  San Antonio (variable name: hhinc_mean2000).
#9 Following Q7, add labels for the X axis and Y axis.
# The y axis of an empirical CDF is the cumulative share of observations at or
# below each income value, so it is labelled as a cumulative probability
# rather than a density.
ggplot(df, aes(x = hhinc_mean2000, color = czname)) +
  stat_ecdf(geom = "step") +
  labs(
    x = "Household Income",
    y = "Cumulative Probability",
    color = "City"
  )
# Console output from the original run: ggplot2 warned that 2 rows containing
# non-finite values were removed (`stat_ecdf()`).

#8 Scatter plot of population density (variable name: popdensity2000) against
#  household income (variable name: hhinc_mean2000) for San Antonio.
ggplot(data = df, mapping = aes(x = hhinc_mean2000, y = popdensity2000)) +
  geom_point() +
  xlab("Household Income") +
  ylab("Population density")
# Console output from the original run: ggplot2 warned that 2 rows containing
# missing values were removed (`geom_point()`).

#10 Make the scatter plot from Q8 interactive using the plotly library.
library(plotly)
# Console output from the original run: attaching plotly masks
# `last_plot` (ggplot2), `filter` (stats) and `layout` (graphics).

# Rebuild the Q8 scatter plot as an object, then convert it to an
# interactive plotly widget.
p <- ggplot(data = df, mapping = aes(x = hhinc_mean2000, y = popdensity2000)) +
  geom_point() +
  xlab("Household Income") +
  ylab("Population density")
ggplotly(p)