#-----------------------------#
#Load Packages & Prepare Data-#
#-----------------------------#
library(kableExtra)
library(knitr)
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(magrittr)
library(ggmap)
library(maps)
library(mapdata)
library(nlme)
library(plotly)
library(ggthemes)
library(stargazer)
library(visreg)
library(texreg)
#----------------------------#
#----------------------------#
#-------Map Data-------------#
#----------------------------#
#----------------------------#
#Load Base Map Data
statemap <- map_data("state")
#Creating a list of State Names & State Abbreviations
#Some datasets only have full state names, some only have abbreviations.
#This will allow me to merge those dataframes together
#Social Explorer has the full state name (New York, New Jersey, Connecticut, etc...)
#Missing Person Data has the State Abbreviations (NY, NJ, CT)
#Map data has the full state name (New York, New Jersey, Connecticut, etc...)
stateabb = c("AK", "AL", "AR",
"AZ", "CA", "CO",
"CT", "DC", "DE",
"FL", "GA", "HI",
"IA", "ID", "IL", "IN", "KS",
"KY", "LA", "MA",
"MD", "ME", "MI", "MN",
"MO", "MS", "MT",
"NC", "ND","NE",
"NH", "NJ", "NM",
"NV", "NY", "OH","OK", "OR",
"PA", "PR", "RI",
"SC", "SD","TN",
"TX", "UT", "VA", "VT","WA",
"WI", "WV", "WY")
statefull = tolower(c("Alaska","Alabama","Arkansas",
"Arizona","California","Colorado",
"Connecticut","District of Columbia","Delaware",
"Florida" , "Georgia" ,"Hawaii",
"Iowa","Idaho","Illinois","Indiana","Kansas",
"Kentucky","Louisiana","Massachusetts",
"Maryland","Maine" ,"Michigan","Minnesota",
"Missouri","Mississippi","Montana",
"North Carolina","North Dakota","Nebraska",
"New Hampshire" , "New Jersey" , "New Mexico",
"Nevada","New York","Ohio","Oklahoma" ,"Oregon",
"Pennsylvania" , "Puerto Rico","Rhode Island",
"South Carolina", "South Dakota" ,"Tennessee" ,
"Texas" , "Utah", "Virginia","Vermont" ,"Washington",
"Wisconsin", "West Virginia" , "Wyoming"))
statelist <- data.frame(statefull, stateabb)
# Add the abbreviations to the statemap data using "merge"
states <-merge(statemap,statelist,
all.x=TRUE, all.y=FALSE,
by.x="region", by.y="statefull")
#----------------------------#
#----------------------------#
#----Missing children data---#
#----------------------------#
#----------------------------#
missingpersons <- read_csv("/Users/Brett/Library/Mobile Documents/com~apple~CloudDocs/All Files/Education/Data Analysis Masters/SPR17.Advanced Analytics/Homework/Missing Children/MediaReadyActiveCases_03082017.csv")%>%
mutate(missingdate = as.Date(missingreporteddate, format = "%m/%d/%y %H:%M"),
birthdate = as.Date(birthdate, format = "%m/%d/%y %H:%M"),
daysmissing = Sys.Date()-missingdate,
white = factor(ifelse(race == "White","White","Not White")),
male = ifelse(sex=="Male",1,0),
missingfromstate = as.factor(missingfromstate),
race = factor(race),
age = Sys.Date()-birthdate)
missingbysex<- missingpersons%>%
select(missingfromstate, sex)%>%
group_by(missingfromstate, sex)%>%
summarize(n=n())%>%
spread(sex, n)%>%
rename(state = missingfromstate,
missingfemales = Female,
missingmales = Male)
#----------------------------#
#---Social Explorer Data-----#
#----State Populations-------#
#----------------------------#
statepop <- read_csv("/Users/Brett/Library/Mobile Documents/com~apple~CloudDocs/All Files/Education/Data Analysis Masters/SPR17.Advanced Analytics/Homework/Missing Children/R11374249_SL040.csv")%>%
rename( #Label Variables with descriptive names
statename = Geo_NAME,
totalpop = SE_T001_001,
popdensity = SE_T001_002,
sextotalpop = SE_T002_001, #Label Male & Female Variables
malepop = SE_T002_002,
femalepop = SE_T002_003,
racetotalpop = SE_T019_001, #Label Race Variables
maleracetotalpop= SE_T019_002,
whitemale = SE_T019_003,
blackmale = SE_T019_004,
nativemale = SE_T019_005,
asianmale = SE_T019_006,
pacifislamale = SE_T019_007,
mixedmale = SE_T019_008,
femracetotalpop = SE_T019_009,
whitefem = SE_T019_010,
blackfem = SE_T019_011,
nativefem = SE_T019_012,
asianfem = SE_T019_013,
pacificislafem = SE_T019_014,
mixedfem = SE_T019_015)%>%
select(statename, totalpop, popdensity,malepop, femalepop)%>%
mutate(statename=tolower(statename)) #Lowercase state name to match state map data
#----------Merge-------------#
#DF 1 = State Abbreviations--#
#DF 2 = Social Explorer Data-#
#----------------------------#
#(This will allow us to merge Soc Explorer with Missing Person data)
sexpopulation2 <- merge(statepop, statelist,
all.x = TRUE,
all.y = TRUE,
by.x = "statename",
by.y = "statefull")%>%
select(statename, stateabb,totalpop, malepop, femalepop,popdensity)
#----------Merge----------------#
#DF 1 = Missing Person----------#
#DF 2 = Soc Explorer Data-------#
#-------------------------------#
# Step 1: Social Explorer + Missing Person Data
# Merge dataframes, match on state abbreviations
SocAndMissing <- merge(sexpopulation2, missingbysex,
all.x = TRUE,
all.y = FALSE,
by.x = "stateabb",
by.y="state")%>%
mutate(
missingrate = (missingmales+missingfemales)/totalpop*100000, #Calculate missing person rate for each sex.
malemissingrate = ((missingmales)/(malepop))*100000,
femalemissingrate = ((missingfemales)/(femalepop))*100000,
missingratio = malemissingrate/femalemissingrate,
dominantmissingsex = ifelse(malemissingrate>femalemissingrate,
"male",
"female"))
#Multiplied % by 1000 for simple interpretation.
#Divide by 1000 for actual % rates at the end.
#----------Merge---------------------------------#
#DF 1 = State Map Data---------------------------#
#DF 2 = Soc Explorer & Missing Person Data-------#
#------------------------------------------------#
missingbysexfinal <- merge(states,SocAndMissing,
all.x = TRUE,
all.y = FALSE,
by.x = "region",
by.y="statename")
SocAndMissing%>%
head()%>%
kable(format="html", table.attr = "style='width:40%;'", align="l")%>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| stateabb | statename | totalpop | malepop | femalepop | popdensity | missingfemales | missingmales | missingrate | malemissingrate | femalemissingrate | missingratio | dominantmissingsex |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AK | alaska | 738432 | 389025 | 349407 | 1.294046 | 4 | 5 | 1.218799 | 1.2852644 | 1.1447968 | 1.1227010 | male |
| AL | alabama | 4858979 | 2353184 | 2505795 | 95.939421 | 16 | 6 | 0.452770 | 0.2549737 | 0.6385199 | 0.3993199 | female |
| AR | arkansas | 2978204 | 1462856 | 1515348 | 57.234949 | 14 | 7 | 0.705123 | 0.4785160 | 0.9238802 | 0.5179416 | female |
| AZ | arizona | 6828065 | 3391490 | 3436575 | 60.109918 | 43 | 28 | 1.039826 | 0.8255958 | 1.2512458 | 0.6598191 | female |
| CA | california | 39144818 | 19443068 | 19701750 | 251.269840 | 293 | 171 | 1.185342 | 0.8794908 | 1.4871775 | 0.5913825 | female |
| CO | colorado | 5456574 | 2743763 | 2712811 | 52.648896 | 43 | 27 | 1.282856 | 0.9840500 | 1.5850717 | 0.6208236 | female |
#Heatmap of missing person rate
map1 <- ggplot() +
geom_polygon(data = missingbysexfinal,
aes(x = long, y = lat, fill =missingrate/1000,
group = group, text = region),color = "grey")+
coord_fixed(1.3)+
xlab("Missing Person Rate by State") + ylab("")+
theme_map()
ggplotly(map1, width = 800, height = 400, tooltip = c("region"))
#Sort States by Missing Rate
SocAndMissing<- transform(SocAndMissing,
statename = reorder(statename, missingrate))
ggplot(data = SocAndMissing, aes(x=statename, y=missingrate/1000, label = sprintf("%2.4f %%", missingrate/1000)))+
geom_bar(aes(x=statename, y=missingrate/1000, fill =missingrate/1000),stat = "identity")+
coord_flip()+
xlab("Missing Rate by State")+
geom_text(position = position_dodge(1), size=5, angle=0, hjust = -.15,color = "black")+
theme_fivethirtyeight()+ theme(axis.text=element_text(size=16))
missingpersons%>%
select(missingfromstate, sex)%>%
group_by(sex)%>%
summarize(missing=n())%>%
ggplot()+
geom_col(aes(x=sex, y=missing, fill=sex))+
labs(title="Number of Missing Children by Sex", x="",y="# Missing")+
theme(legend.position="none")
To more precisely measure the disparity between missing boys & girls, we can plot the missing ratio for each state on a map.
Missing Ratio = (# missing boys) / (# missing girls)
The map shows a heavy bias towards girls.
map3<- ggplot() +
geom_polygon(data = missingbysexfinal,
aes(x = long, y = lat, fill =missingratio, group = group, text = region),color = "grey")+
scale_fill_gradient(low = "#FA8072", high = "darkblue",
space = "Lab", na.value = "grey50", guide = "colourbar")+
coord_fixed(1.3)+
xlab("Missing Ratio: Blue = boys | Coral = girls") + ylab("")+
theme_map()
ggplotly(map3,width = 800, height = 400, tooltip = c("region"))
#Sort States by Missing Ratio
SocAndMissing<- transform(SocAndMissing,
statename = reorder(statename, missingratio))%>%
filter(!is.na(dominantmissingsex))
ggplot()+
geom_bar(data = SocAndMissing, aes(x=statename, y=missingratio, fill =missingratio),stat="identity")+
geom_text(data = SocAndMissing, aes(x=statename, y=missingratio, label=round(missingratio,2)),
position = position_dodge(1), size=5, angle=0, hjust = -.15,color = "black")+
scale_fill_gradient(low = "#FA8072", high = "darkblue", space = "Lab",
na.value = "grey50", guide = "colourbar")+
coord_flip()+
xlab("State")+
theme_fivethirtyeight()+
theme(axis.text=element_text(size=15 ))
I will run a linear regression to evaluate the difference in missing person rates between males and females. This is an ecological analysis, as we are evaluating the aggregate number of missing people by sex for each state. The units of analysis are aggregates, not individual responses.
#Prepare State-level data for Ecological Analysis
SocAndMissing2 <- missingbysex%>%
rename(male=missingmales,
female=missingfemales)%>%
gather(sex,missing, male:female)
SocAndMissing2 <- merge(SocAndMissing2,sexpopulation2,
all.x = TRUE,
all.y = FALSE,
by.x = "state",
by.y="stateabb")%>%
mutate(
missingrate = ifelse(sex == "male",
(missing/malepop),
(missing / femalepop)))
SocAndMissing2%>%
head()%>%
kable(format="html", table.attr = "style='width:40%;'", align="l")%>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| state | sex | missing | statename | totalpop | malepop | femalepop | popdensity | missingrate |
|---|---|---|---|---|---|---|---|---|
| AK | male | 5 | alaska | 738432 | 389025 | 349407 | 1.294046 | 1.29e-05 |
| AK | female | 4 | alaska | 738432 | 389025 | 349407 | 1.294046 | 1.14e-05 |
| AL | female | 16 | alabama | 4858979 | 2353184 | 2505795 | 95.939421 | 6.40e-06 |
| AL | male | 6 | alabama | 4858979 | 2353184 | 2505795 | 95.939421 | 2.50e-06 |
| AR | female | 14 | arkansas | 2978204 | 1462856 | 1515348 | 57.234949 | 9.20e-06 |
| AR | male | 7 | arkansas | 2978204 | 1462856 | 1515348 | 57.234949 | 4.80e-06 |
Model 1: Effect of Sex on Missing Rate
Regression analysis shows boys experiencing a missing rate that is .0004(.04%) lower than that of girls.
Relative to the population, a .04% decrease in the missing person rate for males may not seem like much. However when we consider that this rate represents a 40% ((.10-.04)/.10) disparity between the missing rates experienced by males and females, it is clear that there is a striking difference.
Model 2: Effect of Sex and Population Density Interaction on Missing Rate
missingreg <- lm(missingrate*100 ~ sex, data = SocAndMissing2)
missingreg2 <- lm(missingrate*100 ~ sex*popdensity, data = SocAndMissing2)
htmlreg(list(missingreg, missingreg2), type="html",digits=4)
| Model 1 | Model 2 | ||
|---|---|---|---|
| (Intercept) | 0.0010*** | 0.0010*** | |
| (0.0001) | (0.0001) | ||
| sexmale | -0.0004*** | -0.0004*** | |
| (0.0001) | (0.0001) | ||
| popdensity | 0.0000 | ||
| (0.0000) | |||
| sexmale:popdensity | -0.0000 | ||
| (0.0000) | |||
| R2 | 0.1908 | 0.1939 | |
| Adj. R2 | 0.1826 | 0.1690 | |
| Num. obs. | 101 | 101 | |
| RMSE | 0.0004 | 0.0004 | |
| p < 0.001, p < 0.01, p < 0.05 | |||
visreg(missingreg)
(Breheny and Burchett 2013)
The variable of interest in the analysis above - missingrate - can not be evaluated in a complete pooling, no-pooling, or partial pooling by random intercept method. This is due to the fact that the missing rate is a state-level variable, and not an individual level variable. We can only conduct an ecological analysis with this data.
Using a generalized least squares fit by maximum likelihood regression analysis we will assess the effect of sex & race on # of days a child is missing. This metric (number of days missing) is an individual-level data point.
We will look at the number of days a child has been missing (DV), and the effects of sex & race. Each observation in our data represents a missing child, including how many days the child has been missing, their sex, their race, what state they are from, and other identifying characteristics.
| childfirstname | childlastname | missingfromstate | daysmissing | white | sex |
|---|---|---|---|---|---|
| Christopher | Abeyta | CO | 11567 days | White | Male |
| Aaron | Anderson | MN | 10525 days | White | Male |
| Taranika | Raymond | LA | 8052 days | Not White | Female |
| Ruben | Felix | ID | 7673 days | Not White | Male |
| Tiffany | Dixon | NY | 9640 days | Not White | Female |
| Jamal | Abdul’Faruq | VA | 10193 days | Not White | Male |
Pooling all respondents together, and evaluating # of days a child has been missing, irrespective of state.
Model 1: Effect of sex on # of days a person is missing
Model 2: Interaction effect of sex & race on # of days a person is missing
Non-White females are estimated to be missing for 1,678 days.
Non-White Males are estimated to be missing for 2,129 days.
White females are estimated to be missing for 3,987 days.
White Males are estimated to be missing for 3,438 days.
missingreg3 <- gls(daysmissing ~ male, data = missingpersons, method = "ML") #Model 1
missingreg4 <- gls(daysmissing ~ male*white, data = missingpersons, method = "ML") #Model 2
htmlreg(list(missingreg3,missingreg4), type = "html", digits=0)
| Model 1 | Model 2 | ||
|---|---|---|---|
| (Intercept) | 2749*** | 1810*** | |
| (86) | (106) | ||
| male | 273 | 451* | |
| (142) | (179) | ||
| whiteWhite | 2309*** | ||
| (167) | |||
| male:whiteWhite | -549* | ||
| (276) | |||
| AIC | 54519 | 54278 | |
| BIC | 54537 | 54307 | |
| Log Likelihood | -27256 | -27134 | |
| Num. obs. | 2834 | 2834 | |
| p < 0.001, p < 0.01, p < 0.05 | |||
Running an regression for each state, and plotting the intercept and slope values so that we can evaluate differences betwen states for the number of days that a child is missing (intercept), and the impact of sex (slope).
Most states show an intercept (missingrate for females) between 1,500 and 3,500 days. This is a wide range, especially if you are a missing person hoping to get back to your family!
dcoef <- missingpersons %>%
group_by(missingfromstate)%>%
do(mod = lm(daysmissing ~ male+age, data=.))
intc <- dcoef %>% do(data.frame(intc = coef(.$mod)[1]))
ggplot(intc, aes(x = intc)) + geom_histogram(fill="darkblue")+xlab("Intercept: # of Days Females are missing")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
While a regression with complete pooling would lead us to believe that males are going missing for more days than females, it is clear that the relationship between sex & number of days missing varies by state. It is actually nearly split down the middle, with the effect for males being positive in some states, and negative in others.
#Plotting Effect of Sex on days missing
coef <- dcoef %>% do(data.frame(sexc = coef(.$mod)[2]))
ggplot(coef, aes(x = sexc)) + geom_histogram(fill="darkred")
Lastly, we will conduct partial pooling by running a Linear mixed-effects model fit by maximum likelihood, with random effects. This will allow us to generate the best possible estimation of missing person rates while accounting for the variation between states.
To allow for variation in “Days Missing” between states
m1_lme <- lme(daysmissing ~ male,
data = missingpersons,
random = ~1|missingfromstate,
method = "ML",
na.action=na.exclude)
#summary(m1_lme)
htmlreg(m1_lme, type = "html", digits=0)
| Model 1 | ||
|---|---|---|
| (Intercept) | 3163*** | |
| (175) | ||
| male | 265 | |
| (140) | ||
| AIC | 54419 | |
| BIC | 54442 | |
| Log Likelihood | -27205 | |
| Num. obs. | 2831 | |
| Num. groups | 54 | |
| p < 0.001, p < 0.01, p < 0.05 | ||
(Henningsen and Toomet 2011)
To Allow for difference in gender effect between states
m2_lme <- lme(daysmissing ~ male,
data = missingpersons,
random = ~sex|missingfromstate,
method = "ML",
control = lmeControl(opt='optim'),
na.action=na.exclude)
#summary(m2_lme)
htmlreg(m2_lme, type = "html",digits=0)
| Model 1 | ||
|---|---|---|
| (Intercept) | 3094*** | |
| (156) | ||
| male | 376* | |
| (159) | ||
| AIC | 54420 | |
| BIC | 54456 | |
| Log Likelihood | -27204 | |
| Num. obs. | 2831 | |
| Num. groups | 54 | |
| p < 0.001, p < 0.01, p < 0.05 | ||
https://data.world/jamesgray/missing-children-in-the-us
Publicly available information from The National Center for Missing and Exploited Children (NCMEC), released as part of the Cloudera Child Finder Hackathon to develop new methods for finding missing children.
<www.socialexplorer.com>
New York City, NY: Social Explorer 2017
http://www.socialexplorer.com/pub/reportdata/HtmlResults.aspx?reportid=R11374249
Population Estimates in addition with Social Explorer tables provide the estimates of population, demographic components of change, and housing units for the 2015 vintage year. As each vintage of estimates includes all years since the most recent decennial census, the latest vintage of data available supersedes all previously-produced estimates for those dates.
Breheny, Patrick, and Woodrow Burchett. 2013. “Visualization of Regression Models Using Visreg.” R Package, 1–15.
Henningsen, Arne, and Ott Toomet. 2011. “MaxLik: A Package for Maximum Likelihood Estimation in R.” Computational Statistics 26 (3). Springer: 443–58.