library(readr)
# Load the tract-level covariates table from the working directory.
# The file holds 74,044 rows and 38 columns: one character column (`czname`)
# and 37 double columns (state, county, tract, cz, hhinc_mean2000,
# mean_commutetime2000, ...).
# `show_col_types = FALSE` silences readr's column-specification message;
# run `spec(tract_covariates)` to retrieve the full specification on demand.
tract_covariates <- read_csv("tract_covariates.csv", show_col_types = FALSE)

# `View()` opens an interactive data viewer, which may be unavailable when the
# script runs non-interactively (Rscript, knitting), so only call it from an
# interactive session.
if (interactive()) {
  View(tract_covariates)
}
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Build the San Antonio working set from the table already loaded into
# `tract_covariates`, instead of re-reading the CSV from a machine-specific
# absolute path that only exists on one user's computer.
# NOTE(review): assumes the copy previously read from the Downloads folder is
# the same file as "tract_covariates.csv" loaded above -- confirm.
# `as.data.frame()` keeps `data` a base data.frame, as `read.csv()` returned.
data <- as.data.frame(tract_covariates)

# Keep only the commuting-zone name and the two variables that are plotted.
df_filtered <- data %>%
  select(czname, hhinc_mean2000, popdensity2000)

# Restrict to tracts whose commuting zone is San Antonio.
df_san_antonio <- df_filtered %>%
  filter(czname == "San Antonio")
# Histogram of mean household income (2000) for San Antonio, using bins of
# width 5000.
# Note: when rendered, ggplot2 warns that 2 rows containing non-finite values
# are removed by `stat_bin()`.
df_san_antonio %>%
  ggplot(mapping = aes(x = hhinc_mean2000)) +
  geom_histogram(binwidth = 5000, colour = "black", fill = "skyblue") +
  ggtitle("Histogram of Household Income (2000) - San Antonio") +
  xlab("Household Income (mean, 2000)") +
  ylab("Frequency") +
  theme_minimal()

# Horizontal boxplot of population density (2000) for San Antonio; the
# variable is mapped to the x axis.
ggplot(data = df_san_antonio) +
  geom_boxplot(mapping = aes(x = popdensity2000), fill = "red") +
  ggtitle("Boxplot of Population Density (2000) - San Antonio") +
  xlab("Population Density (2000)") +
  theme_minimal()

# Kernel density estimate of mean household income (2000) for San Antonio.
# Note: when rendered, ggplot2 warns that 2 rows containing non-finite values
# are removed by `stat_density()`.
ggplot(data = df_san_antonio, mapping = aes(x = hhinc_mean2000)) +
  geom_density(fill = "yellow") +
  theme_minimal() +
  labs(
    y = "Density",
    x = "Household Income (mean, 2000)",
    title = "PDF of Household Income (2000) - San Antonio"
  )

# Empirical cumulative distribution of mean household income (2000) for
# San Antonio, drawn as a step function.
# Note: when rendered, ggplot2 warns that 2 rows containing non-finite values
# are removed by `stat_ecdf()`.
df_san_antonio %>%
  ggplot(aes(x = hhinc_mean2000)) +
  stat_ecdf(colour = "green", geom = "step") +
  ggtitle("CDF of Household Income (2000) - San Antonio") +
  xlab("Household Income (mean, 2000)") +
  ylab("Cumulative Probability") +
  theme_minimal()

# Static scatter plot: population density (x) against mean household
# income (y) for San Antonio.
# Note: when rendered, ggplot2 warns that 2 rows containing missing values or
# values outside the scale range are removed by `geom_point()`.
ggplot(
  data = df_san_antonio,
  mapping = aes(x = popdensity2000, y = hhinc_mean2000)
) +
  geom_point(colour = "orange") +
  ggtitle("Scatter Plot of Population Density vs Household Income") +
  xlab("Population Density (2000)") +
  ylab("Household Income (mean, 2000)") +
  theme_minimal()

# Interactive version of the density-vs-income scatter plot, built with plotly.
# Note: plotly warns "Ignoring 2 observations" when this is rendered.
df_san_antonio %>%
  plot_ly(
    x = ~popdensity2000,
    y = ~hhinc_mean2000,
    type = "scatter",
    mode = "markers"
  ) %>%
  layout(
    title = "Scatter Plot of Population Density vs Household Income",
    xaxis = list(title = "Population Density (2000)"),
    yaxis = list(title = "Household Income (mean, 2000)")
  )
# Count the missing values in each plotted column.
# Recorded output: 2 for hhinc_mean2000 and 0 for popdensity2000.
sum(is.na(df_san_antonio[["hhinc_mean2000"]]))
sum(is.na(df_san_antonio[["popdensity2000"]]))

# Keep only the rows that have no missing value in any column.
df_san_antonio_clean <- df_san_antonio %>%
  na.omit()