library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Read the dataset
url <- "https://opportunityinsights.org/wp-content/uploads/2018/10/tract_covariates.csv"
census_data <- read.csv(url)
# Check the structure of the dataset
str(census_data)
## 'data.frame': 74044 obs. of 38 variables:
## $ state : int 1 1 1 1 1 1 1 1 1 1 ...
## $ county : int 1 1 1 1 1 1 1 1 1 1 ...
## $ tract : int 20100 20200 20300 20400 20500 20600 20700 20801 20802 20900 ...
## $ cz : int 11101 11101 11101 11101 11101 11101 11101 11101 11101 11101 ...
## $ czname : chr "Montgomery" "Montgomery" "Montgomery" "Montgomery" ...
## $ hhinc_mean2000 : num 68639 57243 75648 74852 96175 ...
## $ mean_commutetime2000 : num 26.2 24.8 25.3 23 26.2 ...
## $ frac_coll_plus2000 : num 0.156 0.147 0.224 0.23 0.321 ...
## $ frac_coll_plus2010 : num 0.254 0.267 0.164 0.253 0.375 ...
## $ foreign_share2010 : num 0.00995 0.01634 0.0271 0.01508 0.04649 ...
## $ med_hhinc1990 : num 27375 19000 29419 37891 41516 ...
## $ med_hhinc2016 : int 66000 41107 51250 52704 52463 63750 45234 74603 61242 44591 ...
## $ popdensity2000 : num 196 566 624 714 530 ...
## $ poor_share2010 : num 0.105 0.1476 0.0804 0.0632 0.0596 ...
## $ poor_share2000 : num 0.1268 0.2271 0.0766 0.0455 0.0368 ...
## $ poor_share1990 : num 0.0989 0.1983 0.114 0.0679 0.0547 ...
## $ share_white2010 : num 0.837 0.389 0.752 0.919 0.784 ...
## $ share_black2010 : num 0.1192 0.565 0.198 0.0467 0.1397 ...
## $ share_hisp2010 : num 0.023 0.0346 0.0258 0.0194 0.033 ...
## $ share_asian2010 : num 0.00471 0.0023 0.00474 0.00365 0.02603 ...
## $ share_black2000 : num 0.0755 0.6221 0.1491 0.0259 0.0601 ...
## $ share_white2000 : num 0.897 0.355 0.82 0.938 0.897 ...
## $ share_hisp2000 : num 0.00625 0.00846 0.01647 0.02217 0.01573 ...
## $ share_asian2000 : num 0.00364 0.00317 0.00389 0.00729 0.0106 ...
## $ gsmn_math_g3_2013 : num 2.76 2.76 2.76 2.76 2.76 ...
## $ rent_twobed2015 : int NA 907 583 713 923 765 645 532 671 710 ...
## $ singleparent_share2010 : num 0.114 0.488 0.228 0.228 0.26 ...
## $ singleparent_share1990 : num 0.1812 0.3525 0.1259 0.1268 0.0744 ...
## $ singleparent_share2000 : num 0.251 0.393 0.245 0.191 0.168 ...
## $ traveltime15_2010 : num 0.273 0.152 0.206 0.351 0.25 ...
## $ emp2000 : num 0.567 0.493 0.579 0.597 0.661 ...
## $ mail_return_rate2010 : num 83.5 81.3 79.5 83.5 77.3 ...
## $ ln_wage_growth_hs_grad : num 0.0382 0.0893 -0.1777 -0.0723 -0.0961 ...
## $ jobs_total_5mi_2015 : int 10109 9948 10387 12933 12933 9193 11578 1567 2615 279 ...
## $ jobs_highpay_5mi_2015 : int 3396 3328 3230 3635 3635 3052 3389 989 1176 106 ...
## $ popdensity2010 : num 505 1682 1633 1780 2446 ...
## $ ann_avg_job_growth_2004_2013: num -0.00677 -0.00425 0.01422 -0.01984 0.01863 ...
## $ job_density_2013 : num 92.1 971.3 340.9 207.4 800.3 ...
# Create a new dataframe with selected variables
selected_data <- census_data %>%
select(czname, hhinc_mean2000, popdensity2000)
# View the new dataframe
head(selected_data)
## czname hhinc_mean2000 popdensity2000
## 1 Montgomery 68638.73 195.7238
## 2 Montgomery 57242.51 566.3814
## 3 Montgomery 75647.73 624.1968
## 4 Montgomery 74852.05 713.8040
## 5 Montgomery 96174.77 529.9303
## 6 Montgomery 68095.77 408.3740
# Create a new dataframe for San Antonio only
san_antonio_data <- selected_data %>%
filter(czname == "San Antonio")
# View the filtered dataframe
head(san_antonio_data)
## czname hhinc_mean2000 popdensity2000
## 1 San Antonio 60732.74 6.315087
## 2 San Antonio 64234.03 21.576385
## 3 San Antonio 60457.78 22.751017
## 4 San Antonio 44444.81 158.473980
## 5 San Antonio 64706.38 38.283703
## 6 San Antonio 64270.25 122.809260
sum(is.na(san_antonio_data$popdensity2000))
## [1] 0
sum(is.na(san_antonio_data$hhinc_mean2000))
## [1] 2
san_antonio_data <- na.omit(san_antonio_data)
# Histogram of household income for San Antonio
ggplot(san_antonio_data, aes(x = hhinc_mean2000)) +
geom_histogram(binwidth = 5000, fill = "orange", color = "black") +
labs(title = "Household Income Distribution in San Antonio",
x = "Household Income (2000)",
y = "Frequency")

# Boxplot of population density for San Antonio
ggplot(san_antonio_data, aes(y = popdensity2000)) +
geom_boxplot(fill = "purple") +
labs(title = "Population Density in San Antonio",
y = "Population Density (2000)")

# Probability Density Function of household income
ggplot(san_antonio_data, aes(x = hhinc_mean2000)) +
geom_density(fill = "purple", alpha = 0.7) +
labs(title = "Probability Density Function of Household Income",
x = "Household Income (2000)",
y = "Density")

# Cumulative Density Function of household income
ggplot(san_antonio_data, aes(x = hhinc_mean2000)) +
stat_ecdf(geom = "step", color = "red") +
labs(title = "Cumulative Density Function of Household Income",
x = "Household Income (2000)",
y = "Cumulative Probability")

# Scatter plot of population density vs household income
ggplot(san_antonio_data, aes(x = popdensity2000, y = hhinc_mean2000)) +
geom_point(color = "darkblue") +
labs(title = "Scatter Plot of Population Density vs. Household Income",
x = "Population Density (2000)",
y = "Household Income (2000)")

# Create an interactive scatter plot using plotly with specified trace type
interactive_plot <- plot_ly(data = san_antonio_data,
x = ~popdensity2000,
y = ~hhinc_mean2000,
type = 'scatter', # Specify trace type
mode = 'markers',
marker = list(color = 'blue')) %>%
layout(title = "Interactive Scatter Plot of Population Density vs. Household Income",
xaxis = list(title = "Population Density (2000)"),
yaxis = list(title = "Household Income (2000)"))
# Show the interactive plot
interactive_plot