tidycensus::census_api_key(key = "8c44183ca40ce5fa3fcaf95cc87927d974513b9f", install=T)

## Error: A CENSUS_API_KEY already exists. You can overwrite it with the argument overwrite=TRUE

library(tidycensus)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(spdep)

## Loading required package: sp

## Loading required package: spData

## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`

## Loading required package: sf

## Linking to GEOS 3.9.0, GDAL 3.2.1, PROJ 7.2.1

library(sf)
library(ggplot2)
library(dplyr)
library(rgdal)

## rgdal: version: 1.5-23, (SVN revision 1121)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.2.1, released 2020/12/29
## Path to GDAL shared files: C:/Users/chris/OneDrive/Documents/R/win-library/4.1/rgdal/gdal
## GDAL binary built with GEOS: TRUE 
## Loaded PROJ runtime: Rel. 7.2.1, January 1st, 2021, [PJ_VERSION: 721]
## Path to PROJ shared files: C:/Users/chris/OneDrive/Documents/R/win-library/4.1/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.4-5
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading rgdal.
## Overwritten PROJ_LIB was C:/Users/chris/OneDrive/Documents/R/win-library/4.1/rgdal/proj

library(maptools)

## Checking rgeos availability: TRUE

library(RColorBrewer)
library(sp)

A. Download ACS data for the year 2015 for Los Angeles County, Califonia census tracts

ca_acs <- get_acs(geography = "tract",
                    year = 2015,
                    state = "cA", 
                    county = 037, 
                    variables = c("DP05_0001E","DP05_0073PE", 
                                  "DP05_0066PE"),
                    output ="wide", 
                    geometry = TRUE)

## Getting data from the 2011-2015 5-year ACS

## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.

## Using the ACS Data Profile

Rename variables and filter missing cases

dat<-ca_acs%>%
mutate(totpop=DP05_0001E, nhblack=DP05_0073PE,
       hispanic=DP05_0066PE)%>%
  dplyr::select(GEOID,NAME,totpop, nhblack,hispanic)%>%
  na.omit()

head(dat)

a. Proportion of the Population that is Hispanic

dat%>%
  mutate(hispanicq=cut(hispanic,
                   breaks = quantile(dat$hispanic,
                                     p=c(0, .25, .5, .75, 1), na.rm=T),
                   include.lowest = T))%>%
  ggplot()+geom_sf(aes(fill=hispanicq, color=hispanicq))+
  scale_fill_brewer(palette = "Reds")+
  scale_color_brewer(palette = "Reds")+
  ggtitle("Proportion of Hispanic Population in Los Angeles, 2015")

## Proportion of the Population that is Non-Hispanic Black

dat%>%
  mutate(nhblackq=cut(nhblack,
                   breaks = quantile(dat$nhblack,
                                     p=c(0, .25, .5, .75, 1), na.rm=T),
                   include.lowest = T))%>%
  ggplot()+geom_sf(aes(fill=nhblackq, color=nhblackq))+
  scale_fill_brewer(palette = "Blues")+
  scale_color_brewer(palette = "Blueses")+
  ggtitle("Proportion Non-Hispanic Black Population in Los Angeles, 2015")

## Warning in pal_name(palette, type): Unknown palette Blueses

# B Construct a Queen-based continguity and a K=4 nearest neighbor list

wadt<-poly2nb(dat, queen=T)  # Queen-based contiguity neighbor list
summary(wadt)

## Neighbour list object:
## Number of regions: 2326 
## Number of nonzero links: 14794 
## Percentage nonzero weights: 0.2734426 
## Average number of links: 6.360275 
## Link number distribution:
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  19  20  25 
##   4   9  49 200 464 609 494 273 134  51  23   5   2   2   2   2   1   1   1 
## 4 least connected regions:
## 145 266 1341 1342 with 1 link
## 1 most connected region:
## 1025 with 25 links

OA<-nb2listw(wadt, style="W")
knn<-knearneigh(x=coordinates(as(dat, "Spatial")), k=4)  # K= 4 nearest neighbor
knn4<-knn2nb(knn=knn)

C. Produce a map for each of the two outcomes in R

plot(as(dat, "Spatial"),
     main="Queen-based Contiguity")
plot(OA,
     coords=coordinates(as(dat, "Spatial")),
     add=T,
     col=2)

plot(as(dat, "Spatial"),
     main="k=4 Neighbors")
plot(knn4,
     coords=coordinates(as(dat, "Spatial")),
     add=T,
     col=2)

D. Test the global null hypothesis of independence of theses two outcomes, for each neighbor rule (Queen and k=4)

Be sure to report the global Moran I statistic for each outcome, and the associated test statistic

Moran’s I Analysis of Proportion of Hispanic Population in Los Angeles, 2015

moran.test(dat$hispanic, 
           listw=OA) # Queen's case

## 
##  Moran I test under randomisation
## 
## data:  dat$hispanic  
## weights: OA    
## 
## Moran I statistic standard deviate = 68.81, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.8165201722     -0.0004301075      0.0001409572

moran.test(dat$nhblack, 
           listw=OA) # Queen's case

## 
##  Moran I test under randomisation
## 
## data:  dat$nhblack  
## weights: OA    
## 
## Moran I statistic standard deviate = 66.482, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.7867342028     -0.0004301075      0.0001401897

The Moran I statistics for both Queen-based contiguity for Hispanics and non-Hispanic black show that there’s not much of a visible difference between the two outcomes. The moran I statistical value of 0.8165 for hispanics and 0.7867 for non-hispanic black indicates positive autocorrelation meaning there is spatial correlation between the two races. This also means hispanics in tract cluster to each other geographically compared to non-hispanic blacks

K= 4 Case

knn4lw<-nb2listw(knn4)
moran.test(dat$hispanic, listw=knn4lw)

## 
##  Moran I test under randomisation
## 
## data:  dat$hispanic  
## weights: knn4lw    
## 
## Moran I statistic standard deviate = 60.208, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.8315955900     -0.0004301075      0.0001909709

knn4lw<-nb2listw(knn4)
moran.test(dat$nhblack, listw=knn4lw)

## 
##  Moran I test under randomisation
## 
## data:  dat$nhblack  
## weights: knn4lw    
## 
## Moran I statistic standard deviate = 56.32, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.7757472131     -0.0004301075      0.0001899312

Similarly, the Moran I statistics for K=4 neighbors for Hispanics and non-Hispanic black show that there’s not much of a visible difference between the two outcomes. The moran I statistical value of 0.8316 for hispanics and 0.7757 for non-hispanic black indicates positive autocorrelation meaning there is spatial correlation between the two races. ## b. Describe if the two neighbor types give the same results

The two neighbor types both have positive autocorrelation with values not much of a visible difference and same pattern of autocorrelation.

E. Perform a local Moran I analysis

# Local Moran I for Proportion of Hispanic Population

sot<-nb2listw(neighbours = knn4, style = "W")

locali<-localmoran(dat$hispanic, listw = sot, alternative = "two.sided" )
dat$locali<-locali[,1]
dat$localp<-locali[,5]

dat$shispnc<-scale(dat$hispanic)
dat$lag_hispnc<-lag.listw(var=dat$shispnc, x = sot)
dat$quad_sig <- NA
dat$quad_sig[(dat$shispnc >= 0 & dat$lag_hispnc >= 0) & (dat$localp <= 0.05)] <- "H-H" #high high
dat$quad_sig[(dat$shispnc <= 0 & dat$lag_hispnc <= 0) & (dat$localp <= 0.05)] <- "L-L" #low low
dat$quad_sig[(dat$shispnc >= 0 & dat$lag_hispnc <= 0) & (dat$localp <= 0.05)] <- "H-L" #high low
dat$quad_sig[(dat$shispnc <= 0 & dat$lag_hispnc >= 0) & (dat$localp <= 0.05)] <- "L-H" #low high

#WE ASSIGN A # Set the breaks for the thematic map classes
breaks <- seq(1, 5, 1)

# Set the corresponding labels for the thematic map classes
labels <- c("High-High", "Low-Low", "High-Low", "Low-High", "Not Clustered")

# see ?findInterval - This is necessary for making a map
np <- findInterval(dat$quad_sig, breaks)

## Warning in findInterval(dat$quad_sig, breaks): NAs introduced by coercion

# Assign colors to each map class
colors <- c("red", "blue", "lightpink", "skyblue2", "white")

dat%>%
  ggplot()+
  geom_sf(aes(fill = quad_sig))+
  ggtitle("Moran LISA Cluster Map -Proportion of Hispanic Population",
          sub=" Los Angeles County, CA")

# Local Moran I for Proportion of Non-Hispanic Black Population

sot<-nb2listw(neighbours = knn4, style = "W")

locali<-localmoran(dat$nhblack, listw = sot, alternative = "two.sided" )
dat$locali<-locali[,1]
dat$localp<-locali[,5]

dat$snhblk<-scale(dat$nhblack)
dat$lag_nhblk<-lag.listw(var=dat$snhblk, x = sot)
dat$quad_sig <- NA
dat$quad_sig[(dat$snhblk >= 0 & dat$lag_nhblk >= 0) & (dat$localp <= 0.05)] <- "H-H" #high high
dat$quad_sig[(dat$snhblk <= 0 & dat$lag_nhblk <= 0) & (dat$localp <= 0.05)] <- "L-L" #low low
dat$quad_sig[(dat$snhblk >= 0 & dat$lag_nhblk <= 0) & (dat$localp <= 0.05)] <- "H-L" #high low
dat$quad_sig[(dat$snhblk <= 0 & dat$lag_nhblk >= 0) & (dat$localp <= 0.05)] <- "L-H" #low high

#WE ASSIGN A # Set the breaks for the thematic map classes
breaks <- seq(1, 5, 1)

# Set the corresponding labels for the thematic map classes
labels <- c("High-High", "Low-Low", "High-Low", "Low-High", "Not Clustered")

# see ?findInterval - This is necessary for making a map
np <- findInterval(dat$quad_sig, breaks)

## Warning in findInterval(dat$quad_sig, breaks): NAs introduced by coercion

# Assign colors to each map class
colors <- c("red", "blue", "lightpink", "skyblue2", "white")

dat%>%
  ggplot()+
  geom_sf(aes(fill = quad_sig))+
  ggtitle("Moran LISA Cluster Map -Proportion of Non-Hispanic Black",
          sub=" Los Angeles County, CA")

# F. Interpret your findings from the local Moran I analysis.

The Local Moran I analysis for Hispanics indicates varying autocorrelation of high and high, high and low, and low and low patterns. This means Hispanics of higher social status are more likely to live in same neighborhood with Hispanics of the same class. People of lower economic status are also likely to live in a poor area with others of similar background. This means you have wealthy hispanics living next to other affluent hispanics in affluent neighborhoods and poor neighborhoods next to other poor neighborhoods. This indicates both positive and negetaive spatial autocorrelation.

The Local Moran I for non-Hispanic blacks just like the hispanics indicates high and high, and low and high autocorrelation patterns. Meaning affluent blacks living in neighborhoods close to other affluent neighborhoods, and low-high, which is a negative autocorrelation. This situation or scenarios of low and high does occur due to gentrification where new home construction or new business construction are happening in mostly deprived communities. In a nutshell, both analysis depcits both positive and negetaive autocorrelation patterns due to the features of the tracts.

DEM 7263 -Exploratory Spatial Data Analysis- HMWORK 1

Assogba Albert

9/13/2021