# Pin the CRAN mirror so install.packages() never has to prompt for one.
options(repos = c(CRAN = "https://cloud.r-project.org"))

# Install only what is missing: re-installing on every run is slow, requires
# network access, and can change package versions between runs.
# ggplot2 and plotly are listed because later sections of this script attach
# them with library() as well.
required_packages <- c("readr", "dplyr", "ggplot2", "plotly")
is_installed <- vapply(
  required_packages, requireNamespace, logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
library(readr)  
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Location of the tract_covariates.csv file hosted on opportunityinsights.org.
url <- "https://opportunityinsights.org/wp-content/uploads/2018/10/tract_covariates.csv"

# Download and parse the CSV into a tibble; readr guesses the column types.
data <- readr::read_csv(file = url)
## Rows: 74044 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): czname
## dbl (37): state, county, tract, cz, hhinc_mean2000, mean_commutetime2000, fr...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the full dataset.
data %>% head(n = 6L)
## # A tibble: 6 × 38
##   state county tract    cz czname     hhinc_mean2000 mean_commutetime2000
##   <dbl>  <dbl> <dbl> <dbl> <chr>               <dbl>                <dbl>
## 1     1      1 20100 11101 Montgomery         68639.                 26.2
## 2     1      1 20200 11101 Montgomery         57243.                 24.8
## 3     1      1 20300 11101 Montgomery         75648.                 25.3
## 4     1      1 20400 11101 Montgomery         74852.                 23.0
## 5     1      1 20500 11101 Montgomery         96175.                 26.2
## 6     1      1 20600 11101 Montgomery         68096.                 21.6
## # ℹ 31 more variables: frac_coll_plus2000 <dbl>, frac_coll_plus2010 <dbl>,
## #   foreign_share2010 <dbl>, med_hhinc1990 <dbl>, med_hhinc2016 <dbl>,
## #   popdensity2000 <dbl>, poor_share2010 <dbl>, poor_share2000 <dbl>,
## #   poor_share1990 <dbl>, share_white2010 <dbl>, share_black2010 <dbl>,
## #   share_hisp2010 <dbl>, share_asian2010 <dbl>, share_black2000 <dbl>,
## #   share_white2000 <dbl>, share_hisp2000 <dbl>, share_asian2000 <dbl>,
## #   gsmn_math_g3_2013 <dbl>, rent_twobed2015 <dbl>, …
# Print a compact summary of every column (type and leading values).
str(object = data)
## spc_tbl_ [74,044 × 38] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ state                       : num [1:74044] 1 1 1 1 1 1 1 1 1 1 ...
##  $ county                      : num [1:74044] 1 1 1 1 1 1 1 1 1 1 ...
##  $ tract                       : num [1:74044] 20100 20200 20300 20400 20500 ...
##  $ cz                          : num [1:74044] 11101 11101 11101 11101 11101 ...
##  $ czname                      : chr [1:74044] "Montgomery" "Montgomery" "Montgomery" "Montgomery" ...
##  $ hhinc_mean2000              : num [1:74044] 68639 57243 75648 74852 96175 ...
##  $ mean_commutetime2000        : num [1:74044] 26.2 24.8 25.3 23 26.2 ...
##  $ frac_coll_plus2000          : num [1:74044] 0.156 0.147 0.224 0.23 0.321 ...
##  $ frac_coll_plus2010          : num [1:74044] 0.254 0.267 0.164 0.253 0.375 ...
##  $ foreign_share2010           : num [1:74044] 0.00995 0.01634 0.0271 0.01508 0.04649 ...
##  $ med_hhinc1990               : num [1:74044] 27375 19000 29419 37891 41516 ...
##  $ med_hhinc2016               : num [1:74044] 66000 41107 51250 52704 52463 ...
##  $ popdensity2000              : num [1:74044] 196 566 624 714 530 ...
##  $ poor_share2010              : num [1:74044] 0.105 0.1476 0.0804 0.0632 0.0596 ...
##  $ poor_share2000              : num [1:74044] 0.1268 0.2271 0.0766 0.0455 0.0368 ...
##  $ poor_share1990              : num [1:74044] 0.0989 0.1983 0.114 0.0679 0.0547 ...
##  $ share_white2010             : num [1:74044] 0.837 0.389 0.752 0.919 0.784 ...
##  $ share_black2010             : num [1:74044] 0.1192 0.565 0.198 0.0467 0.1397 ...
##  $ share_hisp2010              : num [1:74044] 0.023 0.0346 0.0258 0.0194 0.033 ...
##  $ share_asian2010             : num [1:74044] 0.00471 0.0023 0.00474 0.00365 0.02603 ...
##  $ share_black2000             : num [1:74044] 0.0755 0.6221 0.1491 0.0259 0.0601 ...
##  $ share_white2000             : num [1:74044] 0.897 0.355 0.82 0.938 0.897 ...
##  $ share_hisp2000              : num [1:74044] 0.00625 0.00846 0.01647 0.02217 0.01573 ...
##  $ share_asian2000             : num [1:74044] 0.00364 0.00317 0.00389 0.00729 0.0106 ...
##  $ gsmn_math_g3_2013           : num [1:74044] 2.76 2.76 2.76 2.76 2.76 ...
##  $ rent_twobed2015             : num [1:74044] NA 907 583 713 923 765 645 532 671 710 ...
##  $ singleparent_share2010      : num [1:74044] 0.114 0.488 0.228 0.228 0.26 ...
##  $ singleparent_share1990      : num [1:74044] 0.1812 0.3525 0.1259 0.1268 0.0744 ...
##  $ singleparent_share2000      : num [1:74044] 0.251 0.393 0.245 0.191 0.168 ...
##  $ traveltime15_2010           : num [1:74044] 0.273 0.152 0.206 0.351 0.25 ...
##  $ emp2000                     : num [1:74044] 0.567 0.493 0.579 0.597 0.661 ...
##  $ mail_return_rate2010        : num [1:74044] 83.5 81.3 79.5 83.5 77.3 ...
##  $ ln_wage_growth_hs_grad      : num [1:74044] 0.0382 0.0893 -0.1777 -0.0723 -0.0961 ...
##  $ jobs_total_5mi_2015         : num [1:74044] 10109 9948 10387 12933 12933 ...
##  $ jobs_highpay_5mi_2015       : num [1:74044] 3396 3328 3230 3635 3635 ...
##  $ popdensity2010              : num [1:74044] 505 1682 1633 1780 2446 ...
##  $ ann_avg_job_growth_2004_2013: num [1:74044] -0.00677 -0.00425 0.01422 -0.01984 0.01863 ...
##  $ job_density_2013            : num [1:74044] 92.1 971.3 340.9 207.4 800.3 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   state = col_double(),
##   ..   county = col_double(),
##   ..   tract = col_double(),
##   ..   cz = col_double(),
##   ..   czname = col_character(),
##   ..   hhinc_mean2000 = col_double(),
##   ..   mean_commutetime2000 = col_double(),
##   ..   frac_coll_plus2000 = col_double(),
##   ..   frac_coll_plus2010 = col_double(),
##   ..   foreign_share2010 = col_double(),
##   ..   med_hhinc1990 = col_double(),
##   ..   med_hhinc2016 = col_double(),
##   ..   popdensity2000 = col_double(),
##   ..   poor_share2010 = col_double(),
##   ..   poor_share2000 = col_double(),
##   ..   poor_share1990 = col_double(),
##   ..   share_white2010 = col_double(),
##   ..   share_black2010 = col_double(),
##   ..   share_hisp2010 = col_double(),
##   ..   share_asian2010 = col_double(),
##   ..   share_black2000 = col_double(),
##   ..   share_white2000 = col_double(),
##   ..   share_hisp2000 = col_double(),
##   ..   share_asian2000 = col_double(),
##   ..   gsmn_math_g3_2013 = col_double(),
##   ..   rent_twobed2015 = col_double(),
##   ..   singleparent_share2010 = col_double(),
##   ..   singleparent_share1990 = col_double(),
##   ..   singleparent_share2000 = col_double(),
##   ..   traveltime15_2010 = col_double(),
##   ..   emp2000 = col_double(),
##   ..   mail_return_rate2010 = col_double(),
##   ..   ln_wage_growth_hs_grad = col_double(),
##   ..   jobs_total_5mi_2015 = col_double(),
##   ..   jobs_highpay_5mi_2015 = col_double(),
##   ..   popdensity2010 = col_double(),
##   ..   ann_avg_job_growth_2004_2013 = col_double(),
##   ..   job_density_2013 = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Report the size of the full dataset as c(rows, columns).
c(nrow(data), ncol(data))
## [1] 74044    38
# Keep only the czname, hhinc_mean2000 and popdensity2000 columns.
df_selected <- data[c("czname", "hhinc_mean2000", "popdensity2000")]

# Preview the first six rows of the reduced table.
df_selected %>% head(n = 6L)
## # A tibble: 6 × 3
##   czname     hhinc_mean2000 popdensity2000
##   <chr>               <dbl>          <dbl>
## 1 Montgomery         68639.           196.
## 2 Montgomery         57243.           566.
## 3 Montgomery         75648.           624.
## 4 Montgomery         74852.           714.
## 5 Montgomery         96175.           530.
## 6 Montgomery         68096.           408.
library(dplyr)

# Subset the rows whose czname is exactly "San Antonio".
san_antonio_data <- dplyr::filter(data, czname == "San Antonio")

# Preview the first six rows of the San Antonio subset.
head(x = san_antonio_data, n = 6L)
## # A tibble: 6 × 38
##   state county  tract    cz czname      hhinc_mean2000 mean_commutetime2000
##   <dbl>  <dbl>  <dbl> <dbl> <chr>                <dbl>                <dbl>
## 1    48     13 960100 31301 San Antonio         60733.                 34.6
## 2    48     13 960201 31301 San Antonio         64234.                 38.2
## 3    48     13 960202 31301 San Antonio         60458.                 40.8
## 4    48     13 960300 31301 San Antonio         44445.                 32.7
## 5    48     13 960401 31301 San Antonio         64706.                 38.6
## 6    48     13 960402 31301 San Antonio         64270.                 28.6
## # ℹ 31 more variables: frac_coll_plus2000 <dbl>, frac_coll_plus2010 <dbl>,
## #   foreign_share2010 <dbl>, med_hhinc1990 <dbl>, med_hhinc2016 <dbl>,
## #   popdensity2000 <dbl>, poor_share2010 <dbl>, poor_share2000 <dbl>,
## #   poor_share1990 <dbl>, share_white2010 <dbl>, share_black2010 <dbl>,
## #   share_hisp2010 <dbl>, share_asian2010 <dbl>, share_black2000 <dbl>,
## #   share_white2000 <dbl>, share_hisp2000 <dbl>, share_asian2000 <dbl>,
## #   gsmn_math_g3_2013 <dbl>, rent_twobed2015 <dbl>, …
# Report the size of the San Antonio subset as c(rows, columns).
c(nrow(san_antonio_data), ncol(san_antonio_data))
## [1] 462  38
library(ggplot2)

# Histogram of mean household income (2000) for the San Antonio subset,
# using 5,000-wide bins.
# na.rm = TRUE drops rows with a missing hhinc_mean2000 explicitly; without it
# ggplot2 removes the same rows anyway but emits a "Removed 2 rows containing
# non-finite" warning on every render.
ggplot(san_antonio_data, aes(x = hhinc_mean2000)) +
  geom_histogram(
    binwidth = 5000,
    fill = "blue",
    color = "black",
    na.rm = TRUE
  ) +
  labs(
    title = "Histogram of Household Income (San Antonio)",
    x = "Household Income in 2000",
    y = "Frequency"
  ) +
  theme_minimal()

library(ggplot2)

# Boxplot of population density (2000) across the San Antonio subset.
ggplot(data = san_antonio_data, mapping = aes(y = popdensity2000)) +
  geom_boxplot(colour = "black", fill = "lightblue") +
  ggtitle("Boxplot of Population Density (San Antonio)") +
  ylab("Population Density in 2000") +
  theme_minimal()

library(ggplot2)

# Smoothed density estimate of mean household income (hhinc_mean2000) for the
# San Antonio subset.
# na.rm = TRUE drops rows with a missing income explicitly instead of letting
# ggplot2 remove them with a "Removed 2 rows" warning.
ggplot(san_antonio_data, aes(x = hhinc_mean2000)) +
  geom_density(
    fill = "lightblue",
    color = "black",
    alpha = 0.7,
    na.rm = TRUE
  ) +
  labs(
    title = "Probability Density Function of Household Income (San Antonio)",
    x = "Household Income in 2000",
    y = "Density"
  ) +
  theme_minimal()

library(ggplot2)

# Empirical cumulative distribution of mean household income (hhinc_mean2000)
# for the San Antonio subset, drawn as a step function.
# The title previously said "Cumulative Density Function"; the correct term
# for a CDF is "Cumulative Distribution Function".
# na.rm = TRUE drops rows with a missing income explicitly instead of letting
# ggplot2 remove them with a "Removed 2 rows" warning.
ggplot(san_antonio_data, aes(x = hhinc_mean2000)) +
  stat_ecdf(geom = "step", color = "blue", na.rm = TRUE) +
  labs(
    title = "Cumulative Distribution Function of Household Income (San Antonio)",
    x = "Household Income in 2000",
    y = "Cumulative Probability"
  ) +
  theme_minimal()

library(ggplot2)

# Scatter plot of population density (x) against mean household income (y)
# for the San Antonio subset.
# na.rm = TRUE drops rows with a missing coordinate explicitly instead of
# letting ggplot2 remove them with a "Removed 2 rows" warning.
ggplot(san_antonio_data, aes(x = popdensity2000, y = hhinc_mean2000)) +
  geom_point(color = "blue", na.rm = TRUE) +
  labs(
    title = "Scatter Plot of Population Density and Household Income (San Antonio)",
    x = "Population Density (2000)",
    y = "Household Income (2000)"
  ) +
  theme_minimal()

# Load required library
library(ggplot2)

# Empirical CDF of mean household income for the San Antonio subset, with
# descriptive axis labels.
# The title previously said "Cumulative Density Function"; the correct term
# for a CDF is "Cumulative Distribution Function".
# na.rm = TRUE drops rows with a missing income explicitly instead of letting
# ggplot2 remove them with a "Removed 2 rows" warning.
ggplot(san_antonio_data, aes(x = hhinc_mean2000)) +
  stat_ecdf(geom = "step", color = "blue", na.rm = TRUE) +
  labs(
    title = "Cumulative Distribution Function (CDF) of Household Income in San Antonio",
    x = "Household Income (Mean, 2000)",
    y = "Cumulative Probability"
  ) +
  theme_minimal()

# Load the plotly library
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# Keep only rows where both plotted variables are present. plotly otherwise
# drops the incomplete rows itself and warns "Ignoring 2 observations".
# dplyr::filter is namespaced because several attached packages provide a
# function called filter.
scatter_data <- dplyr::filter(
  san_antonio_data,
  !is.na(popdensity2000),
  !is.na(hhinc_mean2000)
)

# Build an interactive scatter plot of population density (x) against mean
# household income (y); each tract is one outlined marker.
fig <- plot_ly(
  data = scatter_data,
  x = ~popdensity2000,
  y = ~hhinc_mean2000,
  type = "scatter",
  mode = "markers",
  marker = list(
    size = 10,
    color = "rgba(0, 152, 255, .8)",
    line = list(width = 2)
  )
)

# Add the plot title and axis labels.
fig <- fig %>% layout(
  title = "Scatter Plot of Population Density and Household Income for San Antonio",
  xaxis = list(title = "Population Density (2000)"),
  yaxis = list(title = "Household Income (Mean, 2000)")
)

# Display the interactive plot.
fig