Project 1

library(tidycensus)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

# Research question: What is the relationship between Hispanic and Black populations and
# the number of people living in poverty?

# Answer to question: There is a positive relationship between the Hispanic and Black populations and the number
# of people living in poverty in the dataset. Increases in these populations correlate with higher poverty counts,
# suggesting demographic factors play a role in poverty rates, although they explain only about 24% of the variation.
# Other factors likely also influence poverty levels.

# Question 1
# Use the Census API to get census tract-level data with at least 4 variables

# ACS variable codes to download. Each name becomes the column name of that
# estimate in the wide result (the rest of the script reads them as
# phoenix_data$poptotal, $black, $poverty and $hispanic).
# NOTE(review): B03002_* is presumably the Hispanic-origin-by-race table and
# B17017_002 a below-poverty-level count (possibly households rather than
# people) -- confirm against the ACS variable list.
vars <- c(
  poptotal = "B03002_001E",
  black    = "B03002_004E",
  poverty  = "B17017_002E",
  hispanic = "B03002_012E"
)

# Tract-level ACS data for Maricopa County, AZ, for 2021, returned in wide
# format (one column per requested variable).
phoenix_data <- get_acs(
  geography = "tract",
  variables = vars,
  state = "AZ",
  county = "Maricopa",
  year = 2021,
  output = "wide"
)

## Getting data from the 2017-2021 5-year ACS

#Question 2
#Calculate mean, median, min, and max values for relevant Census variables


# Summary statistics of the tract-level estimates.
# na.rm = TRUE keeps a missing tract estimate from turning a statistic into NA.

# Mean total population per tract.
mean_poptotal <- mean(phoenix_data$poptotal, na.rm = TRUE)
# The original (misspelled) name is kept so existing references still work.
mean_poptotla <- mean_poptotal

# Median Black population per tract. This used to call mean(), so the value
# stored under the "median" name was actually the mean.
median_black <- median(phoenix_data$black, na.rm = TRUE)

# Largest poverty count and smallest Hispanic population among the tracts.
max_poverty <- max(phoenix_data$poverty, na.rm = TRUE)
min_hispanic <- min(phoenix_data$hispanic, na.rm = TRUE)


#Question 3
#Make at least three types of figures (scatter plot, histogram plot, boxplot, bar plot, etc.) and summarize your findings 



# Scatterplot: total population vs. poverty count per tract, with the points
# also coloured by the poverty count.
ggplot(phoenix_data, aes(x = poptotal, y = poverty, color = poverty)) +
  geom_point()

# Base plot reused below: Hispanic population on x, Black population on y.
g <- ggplot(phoenix_data, aes(x = hispanic, y = black))

# Scatterplot: total population vs. poverty count without the colour mapping.
# (This identical plot used to be drawn twice in a row; it is drawn once now.)
ggplot(phoenix_data, aes(x = poptotal, y = poverty)) +
  geom_point()

# Scatterplot of Black vs. Hispanic tract populations with a fitted
# least-squares line (no confidence band).
# The axis labels now match the mapping in `g` (they were swapped before), and
# the caption names the actual data source instead of the tutorial's
# "midwest" data set.
g + geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "hispanic_pop",
    y = "black-pop",
    title = "Scatterplot with overlapping points",
    caption = "Source: 2017-2021 5-year ACS"
  )

## `geom_smooth()` using formula = 'y ~ x'

# Boxplot of the total population across all tracts.
phoenix_data %>%
  ggplot(mapping = aes(y = poptotal)) +
  geom_boxplot()

# Side-by-side boxplots of the Hispanic and Black tract populations.
# geom_boxplot() expects a discrete x (or an explicit group). Mapping the
# continuous `hispanic` column to x raised ggplot2's "Continuous x aesthetic"
# warning, so the data are reshaped to long format to get one box per group.
phoenix_data %>%
  pivot_longer(
    cols = c(hispanic, black),
    names_to = "group",
    values_to = "population"
  ) %>%
  ggplot(aes(x = group, y = population)) +
  geom_boxplot()

# Histogram of the Hispanic population per tract, using the default binning.
phoenix_data %>%
  ggplot(mapping = aes(x = hispanic)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Question 4
# Make at least one PDF (probability density function) chart or CDF (cumulative distribution function)
# chart, which should include at least two curves to show the differences

# PDF chart with two curves: density of the Hispanic and of the Black tract
# populations, one coloured curve per group. The task asks for at least two
# curves; the earlier version drew only the Hispanic one.
phoenix_data %>%
  pivot_longer(
    cols = c(hispanic, black),
    names_to = "group",
    values_to = "population"
  ) %>%
  ggplot(aes(x = population, color = group)) +
  geom_density()

# CDF chart: empirical cumulative distribution of total tract population,
# drawn as a step function.
ggplot(phoenix_data, aes(x = poptotal)) +
  stat_ecdf(geom = "step")

# Question 5 Make a prediction of population OR GDP OR other variable of your study area for 
#the next five years (2025-2030) (2'). You can find the relevant codes from lines 59-73 in

# Observed annual series used for the projection: x is the year, y the value.
# NOTE(review): y is presumably the study area's annual population, 2010-2022
# -- confirm the source of these figures.
x <- as.numeric(2010:2022) # year
y <- c(
  3825183, 3875371, 3948165, 4018657, 4094842, 4174423, 4258019,
  4329227, 4405306, 4492261, 4440232, 4494693, 4551524
)

# Scatterplot of the observed series.
plot(x, y, pch = 19, xlab = 'x', ylab = 'y')

# Fit polynomial trends of degree 1 (linear), 2 (quadratic) and 3 (cubic).
poly.lm1 <- lm(y ~ poly(x, 1))
poly.lm2 <- lm(y ~ poly(x, 2))
poly.lm3 <- lm(y ~ poly(x, 3))

# Years to predict. predict() looks the predictor up by name in `newdata`, so
# the column has to be called `x`, like the variable in the model formula.
new.x1 <- seq(2025, 2030)
new.df <- data.frame(x = new.x1)

Projected <- predict(poly.lm1, newdata = new.df)
Projected2 <- predict(poly.lm2, newdata = new.df)
Projected3 <- predict(poly.lm3, newdata = new.df)

# Collect the three projections in one table and print it; before, the
# predictions were computed but never displayed.
projection <- data.frame(
  year = new.x1,
  linear = Projected,
  quadratic = Projected2,
  cubic = Projected3
)
print(projection)






# Question 6: Make an OLS regression analysis or correlation analysis to examine your research questions (2').
# You can find the relevant code in Lines 6-24 of Population Projection.R.
# Make at least one interactive plot or one interactive map (1').
# You can find the code in Lines 86-89 of the file Visualization1.R for the interactive plot,
# or use tmap_mode("view") in the tmap function for mapping.



# OLS regression of the tract poverty count on the Hispanic and Black
# population counts, followed by the model summary.
# (A leftover data("mtcars") call stood here; mtcars is never used, so it was
# removed.)
# `data = .` puts the piped data frame into lm()'s `data` argument; without it
# the pipe would pass the data frame as the first argument, the formula.
phoenix_data %>%
  lm(poverty ~ hispanic + black, data = .) %>%
  summary()

## 
## Call:
## lm(formula = poverty ~ hispanic + black, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -333.17  -83.81  -24.30   57.22  881.12 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 87.952650   6.293842   13.97  < 2e-16 ***
## hispanic     0.045975   0.003518   13.07  < 2e-16 ***
## black        0.098775   0.016087    6.14 1.19e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 130 on 1006 degrees of freedom
## Multiple R-squared:  0.2416, Adjusted R-squared:  0.2401 
## F-statistic: 160.2 on 2 and 1006 DF,  p-value: < 2.2e-16

# Add each group's share of the tract's total population, in percent.
# Tracts with poptotal == 0 give NaN here (0 / 0).
phoenix_data2 <- mutate(
  phoenix_data,
  pct_Black = 100 * black / poptotal,       # Black share of tract population
  pct_Hispanic = 100 * hispanic / poptotal  # Hispanic share of tract population
)




library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# Interactive scatterplot of the Hispanic population share (x) against total
# tract population (y): build the ggplot first, then hand it to ggplotly().
# The caption now names the actual data source; it used to say
# "Source: midwest", a leftover from a tutorial data set.
p <- ggplot(phoenix_data2, aes(x = pct_Hispanic, y = poptotal)) +
  geom_point() +
  labs(
    x = "pct hispanic",
    y = "poptotal",
    title = "Hispanic Population and Total Population in Phoenix",
    caption = "Source: 2017-2021 5-year ACS"
  )
ggplotly(p)

# In this analysis of Maricopa County, AZ, census data were retrieved for total
# population, Black and Hispanic populations, and poverty levels.
# Visualizations indicated that higher percentages of Black and Hispanic
# populations were associated with higher poverty levels. OLS regression
# confirmed that these demographic factors had a significant, positive
# relationship with poverty. A population forecast showed steady growth from
# 2025 to 2030. Interactive scatter plots were also created for deeper
# analysis, illustrating the trends between demographic characteristics and
# socio-economic outcomes.

Project 1

Dagoberto Cantu

2024-11-05