# Point install.packages() at a fixed CRAN mirror.
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Install only the packages that are not already available, so re-running the
# script does not re-download everything. ggplot2 and plotly are listed too
# because the script loads them further down.
required_pkgs <- c("readr", "dplyr", "ggplot2", "plotly")
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}
##
## The downloaded binary packages are in
## /var/folders/4b/1p0sp0rs33xg19wq8_5j5lm40000gn/T//Rtmp7I3wF8/downloaded_packages
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Location of the tract-level covariates CSV on opportunityinsights.org.
url <- "https://opportunityinsights.org/wp-content/uploads/2018/10/tract_covariates.csv"
# Download and parse the CSV into a tibble; readr guesses the column types.
data <- read_csv(file = url)
## Rows: 74044 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): czname
## dbl (37): state, county, tract, cz, hhinc_mean2000, mean_commutetime2000, fr...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the full dataset
head(data, n = 6L)
## # A tibble: 6 × 38
## state county tract cz czname hhinc_mean2000 mean_commutetime2000
## <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 1 1 20100 11101 Montgomery 68639. 26.2
## 2 1 1 20200 11101 Montgomery 57243. 24.8
## 3 1 1 20300 11101 Montgomery 75648. 25.3
## 4 1 1 20400 11101 Montgomery 74852. 23.0
## 5 1 1 20500 11101 Montgomery 96175. 26.2
## 6 1 1 20600 11101 Montgomery 68096. 21.6
## # ℹ 31 more variables: frac_coll_plus2000 <dbl>, frac_coll_plus2010 <dbl>,
## # foreign_share2010 <dbl>, med_hhinc1990 <dbl>, med_hhinc2016 <dbl>,
## # popdensity2000 <dbl>, poor_share2010 <dbl>, poor_share2000 <dbl>,
## # poor_share1990 <dbl>, share_white2010 <dbl>, share_black2010 <dbl>,
## # share_hisp2010 <dbl>, share_asian2010 <dbl>, share_black2000 <dbl>,
## # share_white2000 <dbl>, share_hisp2000 <dbl>, share_asian2000 <dbl>,
## # gsmn_math_g3_2013 <dbl>, rent_twobed2015 <dbl>, …
# Print a compact summary of every column: its type and first few values
utils::str(data)
## spc_tbl_ [74,044 × 38] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ state : num [1:74044] 1 1 1 1 1 1 1 1 1 1 ...
## $ county : num [1:74044] 1 1 1 1 1 1 1 1 1 1 ...
## $ tract : num [1:74044] 20100 20200 20300 20400 20500 ...
## $ cz : num [1:74044] 11101 11101 11101 11101 11101 ...
## $ czname : chr [1:74044] "Montgomery" "Montgomery" "Montgomery" "Montgomery" ...
## $ hhinc_mean2000 : num [1:74044] 68639 57243 75648 74852 96175 ...
## $ mean_commutetime2000 : num [1:74044] 26.2 24.8 25.3 23 26.2 ...
## $ frac_coll_plus2000 : num [1:74044] 0.156 0.147 0.224 0.23 0.321 ...
## $ frac_coll_plus2010 : num [1:74044] 0.254 0.267 0.164 0.253 0.375 ...
## $ foreign_share2010 : num [1:74044] 0.00995 0.01634 0.0271 0.01508 0.04649 ...
## $ med_hhinc1990 : num [1:74044] 27375 19000 29419 37891 41516 ...
## $ med_hhinc2016 : num [1:74044] 66000 41107 51250 52704 52463 ...
## $ popdensity2000 : num [1:74044] 196 566 624 714 530 ...
## $ poor_share2010 : num [1:74044] 0.105 0.1476 0.0804 0.0632 0.0596 ...
## $ poor_share2000 : num [1:74044] 0.1268 0.2271 0.0766 0.0455 0.0368 ...
## $ poor_share1990 : num [1:74044] 0.0989 0.1983 0.114 0.0679 0.0547 ...
## $ share_white2010 : num [1:74044] 0.837 0.389 0.752 0.919 0.784 ...
## $ share_black2010 : num [1:74044] 0.1192 0.565 0.198 0.0467 0.1397 ...
## $ share_hisp2010 : num [1:74044] 0.023 0.0346 0.0258 0.0194 0.033 ...
## $ share_asian2010 : num [1:74044] 0.00471 0.0023 0.00474 0.00365 0.02603 ...
## $ share_black2000 : num [1:74044] 0.0755 0.6221 0.1491 0.0259 0.0601 ...
## $ share_white2000 : num [1:74044] 0.897 0.355 0.82 0.938 0.897 ...
## $ share_hisp2000 : num [1:74044] 0.00625 0.00846 0.01647 0.02217 0.01573 ...
## $ share_asian2000 : num [1:74044] 0.00364 0.00317 0.00389 0.00729 0.0106 ...
## $ gsmn_math_g3_2013 : num [1:74044] 2.76 2.76 2.76 2.76 2.76 ...
## $ rent_twobed2015 : num [1:74044] NA 907 583 713 923 765 645 532 671 710 ...
## $ singleparent_share2010 : num [1:74044] 0.114 0.488 0.228 0.228 0.26 ...
## $ singleparent_share1990 : num [1:74044] 0.1812 0.3525 0.1259 0.1268 0.0744 ...
## $ singleparent_share2000 : num [1:74044] 0.251 0.393 0.245 0.191 0.168 ...
## $ traveltime15_2010 : num [1:74044] 0.273 0.152 0.206 0.351 0.25 ...
## $ emp2000 : num [1:74044] 0.567 0.493 0.579 0.597 0.661 ...
## $ mail_return_rate2010 : num [1:74044] 83.5 81.3 79.5 83.5 77.3 ...
## $ ln_wage_growth_hs_grad : num [1:74044] 0.0382 0.0893 -0.1777 -0.0723 -0.0961 ...
## $ jobs_total_5mi_2015 : num [1:74044] 10109 9948 10387 12933 12933 ...
## $ jobs_highpay_5mi_2015 : num [1:74044] 3396 3328 3230 3635 3635 ...
## $ popdensity2010 : num [1:74044] 505 1682 1633 1780 2446 ...
## $ ann_avg_job_growth_2004_2013: num [1:74044] -0.00677 -0.00425 0.01422 -0.01984 0.01863 ...
## $ job_density_2013 : num [1:74044] 92.1 971.3 340.9 207.4 800.3 ...
## - attr(*, "spec")=
## .. cols(
## .. state = col_double(),
## .. county = col_double(),
## .. tract = col_double(),
## .. cz = col_double(),
## .. czname = col_character(),
## .. hhinc_mean2000 = col_double(),
## .. mean_commutetime2000 = col_double(),
## .. frac_coll_plus2000 = col_double(),
## .. frac_coll_plus2010 = col_double(),
## .. foreign_share2010 = col_double(),
## .. med_hhinc1990 = col_double(),
## .. med_hhinc2016 = col_double(),
## .. popdensity2000 = col_double(),
## .. poor_share2010 = col_double(),
## .. poor_share2000 = col_double(),
## .. poor_share1990 = col_double(),
## .. share_white2010 = col_double(),
## .. share_black2010 = col_double(),
## .. share_hisp2010 = col_double(),
## .. share_asian2010 = col_double(),
## .. share_black2000 = col_double(),
## .. share_white2000 = col_double(),
## .. share_hisp2000 = col_double(),
## .. share_asian2000 = col_double(),
## .. gsmn_math_g3_2013 = col_double(),
## .. rent_twobed2015 = col_double(),
## .. singleparent_share2010 = col_double(),
## .. singleparent_share1990 = col_double(),
## .. singleparent_share2000 = col_double(),
## .. traveltime15_2010 = col_double(),
## .. emp2000 = col_double(),
## .. mail_return_rate2010 = col_double(),
## .. ln_wage_growth_hs_grad = col_double(),
## .. jobs_total_5mi_2015 = col_double(),
## .. jobs_highpay_5mi_2015 = col_double(),
## .. popdensity2010 = col_double(),
## .. ann_avg_job_growth_2004_2013 = col_double(),
## .. job_density_2013 = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Report the size of the dataset as (rows, columns)
c(nrow(data), ncol(data))
## [1] 74044 38
# Keep only the commuting-zone name, mean household income (2000) and
# population density (2000) columns
df_selected <- data %>%
  select(czname, hhinc_mean2000, popdensity2000)
# Preview the first six rows of the reduced dataframe
head(df_selected, n = 6L)
## # A tibble: 6 × 3
## czname hhinc_mean2000 popdensity2000
## <chr> <dbl> <dbl>
## 1 Montgomery 68639. 196.
## 2 Montgomery 57243. 566.
## 3 Montgomery 75648. 624.
## 4 Montgomery 74852. 714.
## 5 Montgomery 96175. 530.
## 6 Montgomery 68096. 408.
library(dplyr)
# Restrict the data to rows whose czname is exactly "San Antonio"
# (rows with a missing czname are dropped by filter()).
san_antonio_data <- dplyr::filter(data, czname == "San Antonio")
# Preview the first six rows of the San Antonio subset
head(san_antonio_data, n = 6L)
## # A tibble: 6 × 38
## state county tract cz czname hhinc_mean2000 mean_commutetime2000
## <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 48 13 960100 31301 San Antonio 60733. 34.6
## 2 48 13 960201 31301 San Antonio 64234. 38.2
## 3 48 13 960202 31301 San Antonio 60458. 40.8
## 4 48 13 960300 31301 San Antonio 44445. 32.7
## 5 48 13 960401 31301 San Antonio 64706. 38.6
## 6 48 13 960402 31301 San Antonio 64270. 28.6
## # ℹ 31 more variables: frac_coll_plus2000 <dbl>, frac_coll_plus2010 <dbl>,
## # foreign_share2010 <dbl>, med_hhinc1990 <dbl>, med_hhinc2016 <dbl>,
## # popdensity2000 <dbl>, poor_share2010 <dbl>, poor_share2000 <dbl>,
## # poor_share1990 <dbl>, share_white2010 <dbl>, share_black2010 <dbl>,
## # share_hisp2010 <dbl>, share_asian2010 <dbl>, share_black2000 <dbl>,
## # share_white2000 <dbl>, share_hisp2000 <dbl>, share_asian2000 <dbl>,
## # gsmn_math_g3_2013 <dbl>, rent_twobed2015 <dbl>, …
# Report the size of the San Antonio subset as (rows, columns)
c(nrow(san_antonio_data), ncol(san_antonio_data))
## [1] 462 38
library(ggplot2)
# Histogram of hhinc_mean2000 for San Antonio, using bins 5000 units wide.
# Rows with a missing income are dropped by ggplot2 with a warning.
ggplot(data = san_antonio_data, mapping = aes(x = hhinc_mean2000)) +
  geom_histogram(
    binwidth = 5000,
    fill = "blue",
    color = "black"
  ) +
  labs(
    title = "Histogram of Household Income (San Antonio)",
    x = "Household Income in 2000",
    y = "Frequency"
  ) +
  theme_minimal()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

library(ggplot2)
# Single boxplot summarising the distribution of popdensity2000 in San Antonio
ggplot(data = san_antonio_data, mapping = aes(y = popdensity2000)) +
  geom_boxplot(
    fill = "lightblue",
    color = "black"
  ) +
  labs(
    title = "Boxplot of Population Density (San Antonio)",
    y = "Population Density in 2000"
  ) +
  theme_minimal()

library(ggplot2)
# Density curve of hhinc_mean2000 for San Antonio, drawn as a
# semi-transparent filled area with a black outline.
ggplot(data = san_antonio_data, mapping = aes(x = hhinc_mean2000)) +
  geom_density(
    fill = "lightblue",
    color = "black",
    alpha = 0.7
  ) +
  labs(
    title = "Probability Density Function of Household Income (San Antonio)",
    x = "Household Income in 2000",
    y = "Density"
  ) +
  theme_minimal()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

library(ggplot2)
# Empirical cumulative distribution function (ECDF) of hhinc_mean2000 for
# San Antonio, drawn as a step function. Rows with a missing income are
# dropped by ggplot2 with a warning.
ggplot(san_antonio_data, aes(x = hhinc_mean2000)) +
  stat_ecdf(geom = "step", color = "blue") +
  labs(
    # A CDF is a cumulative *distribution* function, not a "density" function.
    title = "Cumulative Distribution Function of Household Income (San Antonio)",
    x = "Household Income in 2000",
    y = "Cumulative Probability"
  ) +
  theme_minimal()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_ecdf()`).

library(ggplot2)
# Scatter plot with popdensity2000 on the x axis and hhinc_mean2000 on the
# y axis, one point per San Antonio row. Rows with a missing value in either
# variable are dropped by ggplot2 with a warning.
ggplot(
  data = san_antonio_data,
  mapping = aes(x = popdensity2000, y = hhinc_mean2000)
) +
  geom_point(color = "blue") +
  labs(
    title = "Scatter Plot of Population Density and Household Income (San Antonio)",
    x = "Population Density (2000)",
    y = "Household Income (2000)"
  ) +
  theme_minimal()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Load required library
library(ggplot2)
# Empirical CDF of hhinc_mean2000 for San Antonio as a step function, with
# more descriptive axis labels. Rows with a missing income are dropped by
# ggplot2 with a warning.
ggplot(san_antonio_data, aes(x = hhinc_mean2000)) +
  stat_ecdf(geom = "step", color = "blue") +
  labs(
    # "CDF" stands for cumulative *distribution* function, not "density".
    title = "Cumulative Distribution Function (CDF) of Household Income in San Antonio",
    x = "Household Income (Mean, 2000)",
    y = "Cumulative Probability"
  ) +
  theme_minimal()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_ecdf()`).

# Load the plotly library
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Build the interactive plotly scatter (population density vs. household
# income for San Antonio) and attach the title and axis labels in one pipeline.
fig <- san_antonio_data %>%
  plot_ly(
    x = ~popdensity2000,
    y = ~hhinc_mean2000,
    type = "scatter",
    mode = "markers",
    # Semi-transparent blue markers with a 2-wide outline
    marker = list(
      size = 10,
      color = "rgba(0, 152, 255, .8)",
      line = list(width = 2)
    )
  ) %>%
  layout(
    title = "Scatter Plot of Population Density and Household Income for San Antonio",
    xaxis = list(title = "Population Density (2000)"),
    yaxis = list(title = "Household Income (Mean, 2000)")
  )
# Render the widget
fig
## Warning: Ignoring 2 observations