library(ggplot2)
# Step 1: Read the tract covariates CSV directly from the Opportunity
# Insights website (this needs network access).
df <- read.csv(
  "https://opportunityinsights.org/wp-content/uploads/2018/10/tract_covariates.csv"
)

# Step 2: Create a new dataframe with only the three variables used below.
df1 <- df[, c("czname", "hhinc_mean2000", "popdensity2000")]

# Step 3: Create a new dataframe that only keeps the rows for San Antonio.
# `%in%` is used instead of `==` because `==` yields NA when czname is
# missing, and an NA row index makes `[` insert an all-NA row rather than
# drop it. `%in%` always returns TRUE or FALSE.
df2 <- df1[df1$czname %in% "San Antonio", ]
# Step 4: Histogram of household income (hhinc_mean2000) for the San Antonio
# rows, drawn with a bin width of 5000.
ggplot(data = df2, mapping = aes(x = hhinc_mean2000)) +
  geom_histogram(binwidth = 5000, colour = "black", fill = "blue") +
  ggtitle("Household Income in San Antonio") +
  xlab("Household Income (2000)") +
  ylab("Count")
## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).

# Step 5: Boxplot of population density (popdensity2000) for the San Antonio
# rows. Only the y aesthetic is mapped, so a single box is drawn.
ggplot(data = df2, mapping = aes(y = popdensity2000)) +
  geom_boxplot(colour = "black", fill = "orange") +
  ggtitle("Population Density in San Antonio") +
  ylab("Population Density (2000)")

# Step 6: Estimated probability density function (PDF) of household income
# for San Antonio, drawn as a filled density curve.
ggplot(data = df2, mapping = aes(x = hhinc_mean2000)) +
  geom_density(fill = "lightblue") +
  ggtitle("Probability Density Function of Household Income") +
  xlab("Household Income (2000)") +
  ylab("Density")
## Warning: Removed 2 rows containing non-finite values (`stat_density()`).

# Step 7: Plot the empirical cumulative distribution function (CDF) of
# household income for San Antonio as a step curve. The title says
# "Distribution" rather than "Density": a CDF accumulates probability,
# whereas "density" describes the PDF plotted in Step 6.
ggplot(df2, aes(x = hhinc_mean2000)) +
  stat_ecdf(geom = "step", color = "darkgreen") +
  labs(
    title = "Cumulative Distribution Function of Household Income",
    x = "Household Income (2000)",
    y = "CDF"
  )
## Warning: Removed 2 rows containing non-finite values (`stat_ecdf()`).

# Step 8: Scatter plot for San Antonio with population density on the x axis
# and household income on the y axis. No custom labels yet, so the axes show
# the raw column names.
ggplot(
  data = df2,
  mapping = aes(x = popdensity2000, y = hhinc_mean2000)
) +
  geom_point(colour = "purple")
## Warning: Removed 2 rows containing missing values (`geom_point()`).

# Step 9: The same scatter plot as Step 8, now with a title and
# human-readable labels on both axes.
ggplot(
  data = df2,
  mapping = aes(x = popdensity2000, y = hhinc_mean2000)
) +
  geom_point(colour = "purple") +
  ggtitle("Scatter Plot of Population Density and Household Income") +
  xlab("Population Density (2000)") +
  ylab("Household Income (2000)")
## Warning: Removed 2 rows containing missing values (`geom_point()`).

# Step 10: Make the scatter plot in Q8 interactive using the plotly library
# Load plotly (it must already be installed, e.g. via install.packages("plotly"))
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Build the labelled scatter plot again and keep it in `p` so it can be
# passed to plotly.
p <- ggplot(
  data = df2,
  mapping = aes(x = popdensity2000, y = hhinc_mean2000)
) +
  geom_point(colour = "purple") +
  ggtitle("Scatter Plot of Population Density and Household Income") +
  xlab("Population Density (2000)") +
  ylab("Household Income (2000)")

# Turn the static ggplot object into an interactive plotly plot.
ggplotly(p)