library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Read the dataset
url <- "https://opportunityinsights.org/wp-content/uploads/2018/10/tract_covariates.csv"
census_data <- read.csv(url)

# Check the structure of the dataset
str(census_data)
## 'data.frame':    74044 obs. of  38 variables:
##  $ state                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ county                      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ tract                       : int  20100 20200 20300 20400 20500 20600 20700 20801 20802 20900 ...
##  $ cz                          : int  11101 11101 11101 11101 11101 11101 11101 11101 11101 11101 ...
##  $ czname                      : chr  "Montgomery" "Montgomery" "Montgomery" "Montgomery" ...
##  $ hhinc_mean2000              : num  68639 57243 75648 74852 96175 ...
##  $ mean_commutetime2000        : num  26.2 24.8 25.3 23 26.2 ...
##  $ frac_coll_plus2000          : num  0.156 0.147 0.224 0.23 0.321 ...
##  $ frac_coll_plus2010          : num  0.254 0.267 0.164 0.253 0.375 ...
##  $ foreign_share2010           : num  0.00995 0.01634 0.0271 0.01508 0.04649 ...
##  $ med_hhinc1990               : num  27375 19000 29419 37891 41516 ...
##  $ med_hhinc2016               : int  66000 41107 51250 52704 52463 63750 45234 74603 61242 44591 ...
##  $ popdensity2000              : num  196 566 624 714 530 ...
##  $ poor_share2010              : num  0.105 0.1476 0.0804 0.0632 0.0596 ...
##  $ poor_share2000              : num  0.1268 0.2271 0.0766 0.0455 0.0368 ...
##  $ poor_share1990              : num  0.0989 0.1983 0.114 0.0679 0.0547 ...
##  $ share_white2010             : num  0.837 0.389 0.752 0.919 0.784 ...
##  $ share_black2010             : num  0.1192 0.565 0.198 0.0467 0.1397 ...
##  $ share_hisp2010              : num  0.023 0.0346 0.0258 0.0194 0.033 ...
##  $ share_asian2010             : num  0.00471 0.0023 0.00474 0.00365 0.02603 ...
##  $ share_black2000             : num  0.0755 0.6221 0.1491 0.0259 0.0601 ...
##  $ share_white2000             : num  0.897 0.355 0.82 0.938 0.897 ...
##  $ share_hisp2000              : num  0.00625 0.00846 0.01647 0.02217 0.01573 ...
##  $ share_asian2000             : num  0.00364 0.00317 0.00389 0.00729 0.0106 ...
##  $ gsmn_math_g3_2013           : num  2.76 2.76 2.76 2.76 2.76 ...
##  $ rent_twobed2015             : int  NA 907 583 713 923 765 645 532 671 710 ...
##  $ singleparent_share2010      : num  0.114 0.488 0.228 0.228 0.26 ...
##  $ singleparent_share1990      : num  0.1812 0.3525 0.1259 0.1268 0.0744 ...
##  $ singleparent_share2000      : num  0.251 0.393 0.245 0.191 0.168 ...
##  $ traveltime15_2010           : num  0.273 0.152 0.206 0.351 0.25 ...
##  $ emp2000                     : num  0.567 0.493 0.579 0.597 0.661 ...
##  $ mail_return_rate2010        : num  83.5 81.3 79.5 83.5 77.3 ...
##  $ ln_wage_growth_hs_grad      : num  0.0382 0.0893 -0.1777 -0.0723 -0.0961 ...
##  $ jobs_total_5mi_2015         : int  10109 9948 10387 12933 12933 9193 11578 1567 2615 279 ...
##  $ jobs_highpay_5mi_2015       : int  3396 3328 3230 3635 3635 3052 3389 989 1176 106 ...
##  $ popdensity2010              : num  505 1682 1633 1780 2446 ...
##  $ ann_avg_job_growth_2004_2013: num  -0.00677 -0.00425 0.01422 -0.01984 0.01863 ...
##  $ job_density_2013            : num  92.1 971.3 340.9 207.4 800.3 ...
# Create a new dataframe with selected variables
selected_data <- census_data %>%
  select(czname, hhinc_mean2000, popdensity2000)

# View the new dataframe
head(selected_data)
##       czname hhinc_mean2000 popdensity2000
## 1 Montgomery       68638.73       195.7238
## 2 Montgomery       57242.51       566.3814
## 3 Montgomery       75647.73       624.1968
## 4 Montgomery       74852.05       713.8040
## 5 Montgomery       96174.77       529.9303
## 6 Montgomery       68095.77       408.3740
# Create a new dataframe for San Antonio only
san_antonio_data <- selected_data %>%
  filter(czname == "San Antonio")

# View the filtered dataframe
head(san_antonio_data)
##        czname hhinc_mean2000 popdensity2000
## 1 San Antonio       60732.74       6.315087
## 2 San Antonio       64234.03      21.576385
## 3 San Antonio       60457.78      22.751017
## 4 San Antonio       44444.81     158.473980
## 5 San Antonio       64706.38      38.283703
## 6 San Antonio       64270.25     122.809260
sum(is.na(san_antonio_data$popdensity2000)) 
## [1] 0
sum(is.na(san_antonio_data$hhinc_mean2000)) 
## [1] 2
san_antonio_data <- na.omit(san_antonio_data)

# Histogram of household income for San Antonio
ggplot(san_antonio_data, aes(x = hhinc_mean2000)) +
  geom_histogram(binwidth = 5000, fill = "orange", color = "black") +
  labs(title = "Household Income Distribution in San Antonio",
       x = "Household Income (2000)",
       y = "Frequency")

# Boxplot of population density for San Antonio
ggplot(san_antonio_data, aes(y = popdensity2000)) +
  geom_boxplot(fill = "purple") +
  labs(title = "Population Density in San Antonio",
       y = "Population Density (2000)")

# Probability Density Function of household income
ggplot(san_antonio_data, aes(x = hhinc_mean2000)) +
  geom_density(fill = "purple", alpha = 0.7) +
  labs(title = "Probability Density Function of Household Income",
       x = "Household Income (2000)",
       y = "Density")

# Cumulative Density Function of household income
ggplot(san_antonio_data, aes(x = hhinc_mean2000)) +
  stat_ecdf(geom = "step", color = "red") +
  labs(title = "Cumulative Density Function of Household Income",
       x = "Household Income (2000)",
       y = "Cumulative Probability")

# Scatter plot of population density vs household income
ggplot(san_antonio_data, aes(x = popdensity2000, y = hhinc_mean2000)) +
  geom_point(color = "darkblue") +
  labs(title = "Scatter Plot of Population Density vs. Household Income",
       x = "Population Density (2000)",
       y = "Household Income (2000)")

# Create an interactive scatter plot using plotly with specified trace type
interactive_plot <- plot_ly(data = san_antonio_data, 
                             x = ~popdensity2000, 
                             y = ~hhinc_mean2000, 
                             type = 'scatter',  # Specify trace type
                             mode = 'markers', 
                             marker = list(color = 'blue')) %>%
  layout(title = "Interactive Scatter Plot of Population Density vs. Household Income",
         xaxis = list(title = "Population Density (2000)"),
         yaxis = list(title = "Household Income (2000)"))

# Show the interactive plot
interactive_plot