library(readr)
# Read the tract covariates CSV from the working directory with readr.
tract_covariates <- readr::read_csv(file = "tract_covariates.csv")
## Rows: 74044 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): czname
## dbl (37): state, county, tract, cz, hhinc_mean2000, mean_commutetime2000, fr...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the data viewer only when a person is driving the session: View()
# needs a GUI/IDE data viewer, so it is skipped when the script is run
# non-interactively (e.g. via Rscript or while rendering a report).
if (interactive()) {
  View(tract_covariates)
}

library(ggplot2)

library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# Load the covariates from the working directory (the same relative location
# the readr import at the top of this script uses) instead of a user-specific
# absolute path, so the script runs on any machine.
data <- read.csv("tract_covariates.csv")

# Keep only the three columns used by the plots below. The dplyr verbs are
# namespaced because `filter` is masked by several attached packages.
df_filtered <- data %>%
  dplyr::select(czname, hhinc_mean2000, popdensity2000)

# Restrict to rows whose `czname` is exactly "San Antonio".
df_san_antonio <- df_filtered %>%
  dplyr::filter(czname == "San Antonio")

# Histogram of hhinc_mean2000 for the San Antonio subset, using bins that are
# 5000 units wide.
ggplot(data = df_san_antonio, mapping = aes(x = hhinc_mean2000)) +
  geom_histogram(
    binwidth = 5000,
    color = "black",
    fill = "skyblue"
  ) +
  theme_minimal() +
  labs(
    x = "Household Income (mean, 2000)",
    y = "Frequency",
    title = "Histogram of Household Income (2000) - San Antonio"
  )
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Horizontal boxplot of popdensity2000 for the San Antonio subset (the
# variable is mapped to the x aesthetic only).
ggplot(data = df_san_antonio, mapping = aes(x = popdensity2000)) +
  geom_boxplot(fill = "red") +
  theme_minimal() +
  labs(
    x = "Population Density (2000)",
    title = "Boxplot of Population Density (2000) - San Antonio"
  )

# Kernel density estimate of hhinc_mean2000 for the San Antonio subset.
ggplot(data = df_san_antonio, mapping = aes(x = hhinc_mean2000)) +
  geom_density(fill = "yellow") +
  theme_minimal() +
  labs(
    x = "Household Income (mean, 2000)",
    y = "Density",
    title = "PDF of Household Income (2000) - San Antonio"
  )
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

# Empirical cumulative distribution of hhinc_mean2000 for the San Antonio
# subset, drawn as a step function.
ggplot(data = df_san_antonio, mapping = aes(x = hhinc_mean2000)) +
  stat_ecdf(
    color = "green",
    geom = "step"
  ) +
  theme_minimal() +
  labs(
    x = "Household Income (mean, 2000)",
    y = "Cumulative Probability",
    title = "CDF of Household Income (2000) - San Antonio"
  )
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_ecdf()`).

# Static scatter plot: popdensity2000 on the x axis against hhinc_mean2000 on
# the y axis for the San Antonio subset.
ggplot(
  data = df_san_antonio,
  mapping = aes(x = popdensity2000, y = hhinc_mean2000)
) +
  geom_point(color = "orange") +
  theme_minimal() +
  labs(
    x = "Population Density (2000)",
    y = "Household Income (mean, 2000)",
    title = "Scatter Plot of Population Density vs Household Income"
  )
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Interactive plotly version of the same scatter plot; the plot object is
# passed directly to plotly's layout() to set the title and axis labels.
plotly::layout(
  plot_ly(
    data = df_san_antonio,
    x = ~popdensity2000,
    y = ~hhinc_mean2000,
    type = "scatter",
    mode = "markers"
  ),
  title = "Scatter Plot of Population Density vs Household Income",
  xaxis = list(title = "Population Density (2000)"),
  yaxis = list(title = "Household Income (mean, 2000)")
)
## Warning: Ignoring 2 observations
# Count the missing values in each of the two plotted columns.
sum(is.na(df_san_antonio[["hhinc_mean2000"]]))
## [1] 2
sum(is.na(df_san_antonio[["popdensity2000"]]))
## [1] 0
# Drop every row that has a missing value in any column.
df_san_antonio_clean <- stats::na.omit(object = df_san_antonio)