ANLY-555:Project

R Markdown

Exploratory Data Analysis:

I am using the data for state poverty index.First I tried to understand the summary of all the variables and look at the maximum, minimum and range of variables. I used histograms to understand the distributions patterns of the varaibles. From the plots we can see poverty and urban education percentage are normally distrubuted and the other variables are skewed.

#Summary Statistics

library(xlsx)

## Warning: package 'xlsx' was built under R version 4.1.3

library(sp) # load library sp
library(boot)

## Warning: package 'boot' was built under R version 4.1.3

data = read.xlsx("D:/OneDrive - Harrisburg University/HU-Analytics/555-Spatial-Analytics/Project/StatesPovertyIndex.xlsx",sheetIndex = 1)
names(data)

## [1] "Name"                  "PovertyPct"            "UnemploymentPct"      
## [4] "MedianHouseholdIncome" "FIPS."                 "EducationPct"         
## [7] "EducationPct_Urban"    "TotalExports1Million"

summary(data)

##      Name             PovertyPct    UnemploymentPct MedianHouseholdIncome
##  Length:50          Min.   : 7.60   Min.   :2.400   Min.   :44038        
##  Class :character   1st Qu.:10.70   1st Qu.:3.025   1st Qu.:55294        
##  Mode  :character   Median :12.65   Median :3.500   Median :60352        
##                     Mean   :12.80   Mean   :3.602   Mean   :61648        
##                     3rd Qu.:14.10   3rd Qu.:4.000   3rd Qu.:69633        
##                     Max.   :19.80   Max.   :6.100   Max.   :83076        
##     FIPS.            EducationPct   EducationPct_Urban TotalExports1Million
##  Length:50          Min.   :20.26   Min.   :22.94      Min.   :   16.64    
##  Class :character   1st Qu.:27.20   1st Qu.:29.59      1st Qu.:  458.59    
##  Mode  :character   Median :30.01   Median :32.93      Median : 1838.14    
##                     Mean   :30.65   Mean   :33.08      Mean   : 2778.31    
##                     3rd Qu.:33.27   3rd Qu.:36.35      3rd Qu.: 3537.90    
##                     Max.   :42.91   Max.   :43.97      Max.   :23304.94

str(data)

## 'data.frame':    50 obs. of  8 variables:
##  $ Name                 : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ PovertyPct           : num  16.8 11.1 14.1 16.8 12.8 9.7 10.3 12.2 13.7 14.5 ...
##  $ UnemploymentPct      : num  3 6.1 4.7 3.5 4 2.8 3.7 3.8 3.1 3.4 ...
##  $ MedianHouseholdIncome: num  49881 74912 59079 47094 75250 ...
##  $ FIPS.                : chr  "01000" "02000" "04000" "05000" ...
##  $ EducationPct         : num  24.9 29.2 28.9 22.6 33.3 ...
##  $ EducationPct_Urban   : num  28 31.8 29.6 26.8 33.5 ...
##  $ TotalExports1Million : num  1348.8 16.6 1466.7 3013.5 23304.9 ...

library(tidyr)
library(ggplot2)
library(gstat)

## Warning: package 'gstat' was built under R version 4.1.3

hist(data$EducationPct_Urban,main="Histogram of urban Education Percentage",)

hist(data$PovertyPct,main="Histogram of Poverty",)

hist(data$UnemploymentPct,main="Histogram of Unemployment",)

hist(data$MedianHouseholdIncome, main = "Histogram of Median Household Income")

hist(data$EducationPct, main = "Histogram of Education Pct")

hist(data$TotalExports1Million,main="Histogram of Total Exports",)

Next, we got co-ordinates of the state and merged it to the above data to convert it into spacial data. Then we plotted all the variables across the map of USA.From the plots, we can conclude Califormania has the highest exports and education percent is higher in the northern sates compared to southern states.

state = read.csv("D:/OneDrive - Harrisburg University/HU-Analytics/555-Spatial-Analytics/Project/State.csv")
colnames(state) <- c("state", "lat", "long", "Name")
spatialData <- merge(data, state, by='Name')

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.3

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v tibble  3.1.6     v dplyr   1.0.8
## v readr   2.1.1     v stringr 1.4.0
## v purrr   0.3.4     v forcats 0.5.1

## Warning: package 'forcats' was built under R version 4.1.3

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(lubridate)

## Warning: package 'lubridate' was built under R version 4.1.3

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(usmap)

## Warning: package 'usmap' was built under R version 4.1.3

library(ggplot2)

plot_usmap(data = spatialData, values = "PovertyPct", color = "yellow") +
scale_fill_continuous(low = "white", high = "yellow",name = "PovertyPct", label = scales::comma) + labs(title = "Poverty Pct")+
theme(legend.position = "left")

plot_usmap(data = spatialData, values = "UnemploymentPct", color = "orange") +
scale_fill_continuous(low = "white", high = "orange",name = "UnemploymentPct", label = scales::comma) + labs(title = "UnemploymentPct")+
theme(legend.position = "left")

plot_usmap(data = spatialData, values = "MedianHouseholdIncome", color = "pink") +
scale_fill_continuous(low = "white", high = "pink",name = "MedianHouseholdIncome", label = scales::comma) + labs(title = "MedianHouseholdIncome")+
theme(legend.position = "left")

plot_usmap(data = spatialData, values = "EducationPct", color = "orange") +
scale_fill_continuous(low = "white", high = "orange",name = "EducationPct", label = scales::comma) + labs(title = "EducationPct")+
theme(legend.position = "left")

plot_usmap(data = spatialData, values = "EducationPct_Urban", color = "green") +
scale_fill_continuous(low = "white", high = "green",name = "EducationPct_Urban", label = scales::comma) + labs(title = "EducationPct_Urban")+
theme(legend.position = "left")

plot_usmap(data = spatialData, values = "TotalExports1Million", color = "red") +
scale_fill_continuous(low = "white", high = "red",name = "TotalExports1Million", label = scales::comma) + labs(title = "TotalExports1Million")+
theme(legend.position = "left")

Then I looked at a variogram, which shows the variation in data points as a function of distance. It provides a description of the data’s spatial continuity.From the plot we can see some of the variables have spatial continuity while its missing in some of the variables.

library(ape)

## Warning: package 'ape' was built under R version 4.1.3

library(spdep)

## Warning: package 'spdep' was built under R version 4.1.3

## Loading required package: spData

## Warning: package 'spData' was built under R version 4.1.3

## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`

## Loading required package: sf

## Warning: package 'sf' was built under R version 4.1.3

## Linking to GEOS 3.9.1, GDAL 3.2.1, PROJ 7.2.1; sf_use_s2() is TRUE

## Registered S3 method overwritten by 'spdep':
##   method   from
##   plot.mst ape

coordinates(spatialData) <- c("long", "lat")
##Poverty
vario_state.dir=variogram(PovertyPct~1,
data=spatialData,
alpha = c(0, 45, 90, 135))
plot(vario_state.dir)

vario_state=variogram(PovertyPct~1,data = spatialData)
plot(vario_state)

## Unemployment
vario_state.dir=variogram(UnemploymentPct~1,
data=spatialData,
alpha = c(0, 45, 90, 135))
plot(vario_state.dir)

vario_state=variogram(UnemploymentPct~1,data = spatialData)
plot(vario_state)

## MedianHouseholdIncome
vario_state.dir=variogram(MedianHouseholdIncome~1,
data=spatialData,
alpha = c(0, 45, 90, 135))
plot(vario_state.dir)

vario_state=variogram(MedianHouseholdIncome~1,data = spatialData)
plot(vario_state)

## EducationPct
vario_state.dir=variogram(EducationPct~1,
data=spatialData,
alpha = c(0, 45, 90, 135))
plot(vario_state.dir)

vario_state=variogram(EducationPct~1,data = spatialData)
plot(vario_state)

## EducationPct_Urban
vario_state.dir=variogram(EducationPct_Urban~1,
data=spatialData,
alpha = c(0, 45, 90, 135))
plot(vario_state.dir)

vario_state=variogram(EducationPct_Urban~1,data = spatialData)
plot(vario_state)

## TotalExports1Million
vario_state.dir=variogram(TotalExports1Million~1,
data=spatialData,
alpha = c(0, 45, 90, 135))
plot(vario_state.dir)

vario_state=variogram(TotalExports1Million~1,data = spatialData)
plot(vario_state)

I ran Moran test-1 to test for spatial conclusion and from the test results we accepted the null hypothesis if p-value is greater than 0.05, else we would say there is spatial correlation at 0.05 significance.

spatialData1 = spatialData[,-c(1,5,9)]
summary(spatialData1)

## Object of class SpatialPointsDataFrame
## Coordinates:
##             min       max
## long -155.66586 -69.44547
## lat    19.89868  63.58875
## Is projected: NA 
## proj4string : [NA]
## Number of points: 50
## Data attributes:
##    PovertyPct    UnemploymentPct MedianHouseholdIncome  EducationPct  
##  Min.   : 7.60   Min.   :2.400   Min.   :44038         Min.   :20.26  
##  1st Qu.:10.70   1st Qu.:3.025   1st Qu.:55294         1st Qu.:27.20  
##  Median :12.65   Median :3.500   Median :60352         Median :30.01  
##  Mean   :12.80   Mean   :3.602   Mean   :61648         Mean   :30.65  
##  3rd Qu.:14.10   3rd Qu.:4.000   3rd Qu.:69633         3rd Qu.:33.27  
##  Max.   :19.80   Max.   :6.100   Max.   :83076         Max.   :42.91  
##  EducationPct_Urban TotalExports1Million
##  Min.   :22.94      Min.   :   16.64    
##  1st Qu.:29.59      1st Qu.:  458.59    
##  Median :32.93      Median : 1838.14    
##  Mean   :33.08      Mean   : 2778.31    
##  3rd Qu.:36.35      3rd Qu.: 3537.90    
##  Max.   :43.97      Max.   :23304.94

#coordinates(spatialData1) = ~lat+long
#RDT_data_nb<-dnearneigh(spatialData1,0,1) #this is different for all types of spatial data (point,line,polygon)
#moran.test(spatialData1$PovertyPct, nb2listw(RDT_data_nb,style="W", zero.policy = TRUE))
#moran.test(spatialData1$UnemploymentPct, nb2listw(RDT_data_nb,style="W"))
#moran.test(spatialData1$MedianHouseholdIncome, nb2listw(RDT_data_nb,style="W"))
#moran.test(spatialData1$EducationPct, nb2listw(RDT_data_nb,style="W"))
#moran.test(spatialData1$EducationPct_Urban, nb2listw(RDT_data_nb,style="W"))
#moran.test(spatialData1$TotalExports1Million, nb2listw(RDT_data_nb,style="W"))

We bould a regression model that predicts from the variables and from the results we can see all the variables play significant role in predicting the poverty index.

# spatial lag model.
library(spatialreg )

## Warning: package 'spatialreg' was built under R version 4.1.3

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## 
## Attaching package: 'spatialreg'

## The following objects are masked from 'package:spdep':
## 
##     get.ClusterOption, get.coresOption, get.mcOption,
##     get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
##     set.coresOption, set.mcOption, set.VerboseOption,
##     set.ZeroPolicyOption

f1 <- PovertyPct ~ MedianHouseholdIncome + UnemploymentPct + EducationPct + EducationPct_Urban + TotalExports1Million


data = read.xlsx("D:/OneDrive - Harrisburg University/HU-Analytics/555-Spatial-Analytics/Project/StatesPovertyIndex.xlsx",sheetIndex = 1)
state = read.csv("D:/OneDrive - Harrisburg University/HU-Analytics/555-Spatial-Analytics/Project/State.csv")
colnames(state) <- c("state", "lat", "long", "Name")
spatialData <- merge(data, state, by='Name')

x_coords <- spatialData$lat
y_coords <- spatialData$long

poly1 <- sp::Polygon(cbind(x_coords,y_coords))
firstPoly <- sp::Polygons(list(poly1), ID = "A")
firstSpatialPoly <- sp::SpatialPolygons(list(firstPoly))


#poly.data <- Polygon(spatialData1)
#poly.data <- SpatialPolygons(list(Polygons(list(poly.data), ID = 1)))
#nb <- poly2nb(firstSpatialPoly)
#lw <- nb2listw(nb)
#m1s = lagsarlm(f1, data=spatialData1, lw, tol.solve=1.0e-30)
#summary(m1s)
m1 <- lm(f1, data = spatialData)
summary(m1)

## 
## Call:
## lm(formula = f1, data = spatialData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7157 -1.2667  0.1958  1.0448  4.7887 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            2.524e+01  2.658e+00   9.496 3.22e-12 ***
## MedianHouseholdIncome -3.458e-05  3.101e-05  -1.115    0.271    
## UnemploymentPct        3.354e-01  3.476e-01   0.965    0.340    
## EducationPct          -3.491e-01  1.536e-01  -2.273    0.028 *  
## EducationPct_Urban    -2.489e-02  1.539e-01  -0.162    0.872    
## TotalExports1Million   1.157e-06  7.034e-05   0.016    0.987    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.842 on 44 degrees of freedom
## Multiple R-squared:  0.6035, Adjusted R-squared:  0.5584 
## F-statistic: 13.39 on 5 and 44 DF,  p-value: 5.979e-08

ANLY-555:Project

Mohan Kunapareddy

8/17/2022

R Markdown