I am using the data for state poverty index.First I tried to understand the summary of all the variables and look at the maximum, minimum and range of variables. I used histograms to understand the distributions patterns of the varaibles. From the plots we can see poverty and urban education percentage are normally distrubuted and the other variables are skewed.
#Summary Statistics
library(xlsx)
## Warning: package 'xlsx' was built under R version 4.1.3
library(sp) # load library sp
library(boot)
## Warning: package 'boot' was built under R version 4.1.3
data = read.xlsx("D:/OneDrive - Harrisburg University/HU-Analytics/555-Spatial-Analytics/Project/StatesPovertyIndex.xlsx",sheetIndex = 1)
names(data)
## [1] "Name" "PovertyPct" "UnemploymentPct"
## [4] "MedianHouseholdIncome" "FIPS." "EducationPct"
## [7] "EducationPct_Urban" "TotalExports1Million"
summary(data)
## Name PovertyPct UnemploymentPct MedianHouseholdIncome
## Length:50 Min. : 7.60 Min. :2.400 Min. :44038
## Class :character 1st Qu.:10.70 1st Qu.:3.025 1st Qu.:55294
## Mode :character Median :12.65 Median :3.500 Median :60352
## Mean :12.80 Mean :3.602 Mean :61648
## 3rd Qu.:14.10 3rd Qu.:4.000 3rd Qu.:69633
## Max. :19.80 Max. :6.100 Max. :83076
## FIPS. EducationPct EducationPct_Urban TotalExports1Million
## Length:50 Min. :20.26 Min. :22.94 Min. : 16.64
## Class :character 1st Qu.:27.20 1st Qu.:29.59 1st Qu.: 458.59
## Mode :character Median :30.01 Median :32.93 Median : 1838.14
## Mean :30.65 Mean :33.08 Mean : 2778.31
## 3rd Qu.:33.27 3rd Qu.:36.35 3rd Qu.: 3537.90
## Max. :42.91 Max. :43.97 Max. :23304.94
str(data)
## 'data.frame': 50 obs. of 8 variables:
## $ Name : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ PovertyPct : num 16.8 11.1 14.1 16.8 12.8 9.7 10.3 12.2 13.7 14.5 ...
## $ UnemploymentPct : num 3 6.1 4.7 3.5 4 2.8 3.7 3.8 3.1 3.4 ...
## $ MedianHouseholdIncome: num 49881 74912 59079 47094 75250 ...
## $ FIPS. : chr "01000" "02000" "04000" "05000" ...
## $ EducationPct : num 24.9 29.2 28.9 22.6 33.3 ...
## $ EducationPct_Urban : num 28 31.8 29.6 26.8 33.5 ...
## $ TotalExports1Million : num 1348.8 16.6 1466.7 3013.5 23304.9 ...
library(tidyr)
library(ggplot2)
library(gstat)
## Warning: package 'gstat' was built under R version 4.1.3
hist(data$EducationPct_Urban,main="Histogram of urban Education Percentage",)
hist(data$PovertyPct,main="Histogram of Poverty",)
hist(data$UnemploymentPct,main="Histogram of Unemployment",)
hist(data$MedianHouseholdIncome, main = "Histogram of Median Household Income")
hist(data$EducationPct, main = "Histogram of Education Pct")
hist(data$TotalExports1Million,main="Histogram of Total Exports",)
Next, we got co-ordinates of the state and merged it to the above data to convert it into spacial data. Then we plotted all the variables across the map of USA.From the plots, we can conclude Califormania has the highest exports and education percent is higher in the northern sates compared to southern states.
state = read.csv("D:/OneDrive - Harrisburg University/HU-Analytics/555-Spatial-Analytics/Project/State.csv")
colnames(state) <- c("state", "lat", "long", "Name")
spatialData <- merge(data, state, by='Name')
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.6 v dplyr 1.0.8
## v readr 2.1.1 v stringr 1.4.0
## v purrr 0.3.4 v forcats 0.5.1
## Warning: package 'forcats' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.1.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(usmap)
## Warning: package 'usmap' was built under R version 4.1.3
library(ggplot2)
plot_usmap(data = spatialData, values = "PovertyPct", color = "yellow") +
scale_fill_continuous(low = "white", high = "yellow",name = "PovertyPct", label = scales::comma) + labs(title = "Poverty Pct")+
theme(legend.position = "left")
plot_usmap(data = spatialData, values = "UnemploymentPct", color = "orange") +
scale_fill_continuous(low = "white", high = "orange",name = "UnemploymentPct", label = scales::comma) + labs(title = "UnemploymentPct")+
theme(legend.position = "left")
plot_usmap(data = spatialData, values = "MedianHouseholdIncome", color = "pink") +
scale_fill_continuous(low = "white", high = "pink",name = "MedianHouseholdIncome", label = scales::comma) + labs(title = "MedianHouseholdIncome")+
theme(legend.position = "left")
plot_usmap(data = spatialData, values = "EducationPct", color = "orange") +
scale_fill_continuous(low = "white", high = "orange",name = "EducationPct", label = scales::comma) + labs(title = "EducationPct")+
theme(legend.position = "left")
plot_usmap(data = spatialData, values = "EducationPct_Urban", color = "green") +
scale_fill_continuous(low = "white", high = "green",name = "EducationPct_Urban", label = scales::comma) + labs(title = "EducationPct_Urban")+
theme(legend.position = "left")
plot_usmap(data = spatialData, values = "TotalExports1Million", color = "red") +
scale_fill_continuous(low = "white", high = "red",name = "TotalExports1Million", label = scales::comma) + labs(title = "TotalExports1Million")+
theme(legend.position = "left")
Then I looked at a variogram, which shows the variation in data points as a function of distance. It provides a description of the data’s spatial continuity.From the plot we can see some of the variables have spatial continuity while its missing in some of the variables.
library(ape)
## Warning: package 'ape' was built under R version 4.1.3
library(spdep)
## Warning: package 'spdep' was built under R version 4.1.3
## Loading required package: spData
## Warning: package 'spData' was built under R version 4.1.3
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: sf
## Warning: package 'sf' was built under R version 4.1.3
## Linking to GEOS 3.9.1, GDAL 3.2.1, PROJ 7.2.1; sf_use_s2() is TRUE
## Registered S3 method overwritten by 'spdep':
## method from
## plot.mst ape
coordinates(spatialData) <- c("long", "lat")
##Poverty
vario_state.dir=variogram(PovertyPct~1,
data=spatialData,
alpha = c(0, 45, 90, 135))
plot(vario_state.dir)
vario_state=variogram(PovertyPct~1,data = spatialData)
plot(vario_state)
## Unemployment
vario_state.dir=variogram(UnemploymentPct~1,
data=spatialData,
alpha = c(0, 45, 90, 135))
plot(vario_state.dir)
vario_state=variogram(UnemploymentPct~1,data = spatialData)
plot(vario_state)
## MedianHouseholdIncome
vario_state.dir=variogram(MedianHouseholdIncome~1,
data=spatialData,
alpha = c(0, 45, 90, 135))
plot(vario_state.dir)
vario_state=variogram(MedianHouseholdIncome~1,data = spatialData)
plot(vario_state)
## EducationPct
vario_state.dir=variogram(EducationPct~1,
data=spatialData,
alpha = c(0, 45, 90, 135))
plot(vario_state.dir)
vario_state=variogram(EducationPct~1,data = spatialData)
plot(vario_state)
## EducationPct_Urban
vario_state.dir=variogram(EducationPct_Urban~1,
data=spatialData,
alpha = c(0, 45, 90, 135))
plot(vario_state.dir)
vario_state=variogram(EducationPct_Urban~1,data = spatialData)
plot(vario_state)
## TotalExports1Million
vario_state.dir=variogram(TotalExports1Million~1,
data=spatialData,
alpha = c(0, 45, 90, 135))
plot(vario_state.dir)
vario_state=variogram(TotalExports1Million~1,data = spatialData)
plot(vario_state)
I ran Moran test-1 to test for spatial conclusion and from the test results we accepted the null hypothesis if p-value is greater than 0.05, else we would say there is spatial correlation at 0.05 significance.
spatialData1 = spatialData[,-c(1,5,9)]
summary(spatialData1)
## Object of class SpatialPointsDataFrame
## Coordinates:
## min max
## long -155.66586 -69.44547
## lat 19.89868 63.58875
## Is projected: NA
## proj4string : [NA]
## Number of points: 50
## Data attributes:
## PovertyPct UnemploymentPct MedianHouseholdIncome EducationPct
## Min. : 7.60 Min. :2.400 Min. :44038 Min. :20.26
## 1st Qu.:10.70 1st Qu.:3.025 1st Qu.:55294 1st Qu.:27.20
## Median :12.65 Median :3.500 Median :60352 Median :30.01
## Mean :12.80 Mean :3.602 Mean :61648 Mean :30.65
## 3rd Qu.:14.10 3rd Qu.:4.000 3rd Qu.:69633 3rd Qu.:33.27
## Max. :19.80 Max. :6.100 Max. :83076 Max. :42.91
## EducationPct_Urban TotalExports1Million
## Min. :22.94 Min. : 16.64
## 1st Qu.:29.59 1st Qu.: 458.59
## Median :32.93 Median : 1838.14
## Mean :33.08 Mean : 2778.31
## 3rd Qu.:36.35 3rd Qu.: 3537.90
## Max. :43.97 Max. :23304.94
#coordinates(spatialData1) = ~lat+long
#RDT_data_nb<-dnearneigh(spatialData1,0,1) #this is different for all types of spatial data (point,line,polygon)
#moran.test(spatialData1$PovertyPct, nb2listw(RDT_data_nb,style="W", zero.policy = TRUE))
#moran.test(spatialData1$UnemploymentPct, nb2listw(RDT_data_nb,style="W"))
#moran.test(spatialData1$MedianHouseholdIncome, nb2listw(RDT_data_nb,style="W"))
#moran.test(spatialData1$EducationPct, nb2listw(RDT_data_nb,style="W"))
#moran.test(spatialData1$EducationPct_Urban, nb2listw(RDT_data_nb,style="W"))
#moran.test(spatialData1$TotalExports1Million, nb2listw(RDT_data_nb,style="W"))
We bould a regression model that predicts from the variables and from the results we can see all the variables play significant role in predicting the poverty index.
# spatial lag model.
library(spatialreg )
## Warning: package 'spatialreg' was built under R version 4.1.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'spatialreg'
## The following objects are masked from 'package:spdep':
##
## get.ClusterOption, get.coresOption, get.mcOption,
## get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
## set.coresOption, set.mcOption, set.VerboseOption,
## set.ZeroPolicyOption
f1 <- PovertyPct ~ MedianHouseholdIncome + UnemploymentPct + EducationPct + EducationPct_Urban + TotalExports1Million
data = read.xlsx("D:/OneDrive - Harrisburg University/HU-Analytics/555-Spatial-Analytics/Project/StatesPovertyIndex.xlsx",sheetIndex = 1)
state = read.csv("D:/OneDrive - Harrisburg University/HU-Analytics/555-Spatial-Analytics/Project/State.csv")
colnames(state) <- c("state", "lat", "long", "Name")
spatialData <- merge(data, state, by='Name')
x_coords <- spatialData$lat
y_coords <- spatialData$long
poly1 <- sp::Polygon(cbind(x_coords,y_coords))
firstPoly <- sp::Polygons(list(poly1), ID = "A")
firstSpatialPoly <- sp::SpatialPolygons(list(firstPoly))
#poly.data <- Polygon(spatialData1)
#poly.data <- SpatialPolygons(list(Polygons(list(poly.data), ID = 1)))
#nb <- poly2nb(firstSpatialPoly)
#lw <- nb2listw(nb)
#m1s = lagsarlm(f1, data=spatialData1, lw, tol.solve=1.0e-30)
#summary(m1s)
m1 <- lm(f1, data = spatialData)
summary(m1)
##
## Call:
## lm(formula = f1, data = spatialData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7157 -1.2667 0.1958 1.0448 4.7887
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.524e+01 2.658e+00 9.496 3.22e-12 ***
## MedianHouseholdIncome -3.458e-05 3.101e-05 -1.115 0.271
## UnemploymentPct 3.354e-01 3.476e-01 0.965 0.340
## EducationPct -3.491e-01 1.536e-01 -2.273 0.028 *
## EducationPct_Urban -2.489e-02 1.539e-01 -0.162 0.872
## TotalExports1Million 1.157e-06 7.034e-05 0.016 0.987
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.842 on 44 degrees of freedom
## Multiple R-squared: 0.6035, Adjusted R-squared: 0.5584
## F-statistic: 13.39 on 5 and 44 DF, p-value: 5.979e-08