Read the data in. (Make sure the readxl package is installed.)
#install.packages('readxl')
library(readxl)
library(plyr)
# Load each PANA results workbook into its own data frame.
jobs <- read_excel("PANA JOB results.xlsx")
agg <- read_excel("PANA aggregated results.xlsx")
hea_edu <- read_excel("PANA HEA_EDU results.xlsx")
hea <- read_excel("PANA HEA results.xlsx")
How many of the monthly income values are missing?
# Tally recorded (FALSE) versus missing (TRUE) monthly incomes.
table(is.na(agg[["monthly_income"]]))
##
## FALSE TRUE
## 124 384
Is there a relationship between missingness and cultural group?
# Cross-tabulate cultural group (rows) against income missingness (columns).
table(agg[["cultural_group"]], is.na(agg[["monthly_income"]]))
##
## FALSE TRUE
## Ethiopian 87 48
## Karen Burmese 3 114
## Somali 15 67
## Somali Bantu 6 85
## Sudanese 6 49
## Swahili 7 18
Make sure the ggplot2 package is installed!
#install.packages('ggplot2')
library(ggplot2)
# Box plot of monthly income for each cultural group.
ggplot(
  agg,
  aes(x = cultural_group, y = monthly_income, group = cultural_group)
) +
  geom_boxplot() +
  labs(x = "Cultural Group", y = "Monthly Income")
Let’s make a table of some statistics for this data.
# install.packages("plyr")
# Keep only respondents whose cultural group is recorded.
agg_rm <- subset(agg, !is.na(cultural_group))
# Per-group income summary. Missing incomes are excluded from every statistic,
# and N reports how many non-missing incomes each row is based on.
ddply(
  agg_rm, .(cultural_group), summarise,
  mean_monthly_income = mean(monthly_income, na.rm = TRUE),
  min_monthly_income = min(monthly_income, na.rm = TRUE),
  max_monthly_income = max(monthly_income, na.rm = TRUE),
  N = sum(!is.na(monthly_income))
)
## cultural_group mean_monthly_income min_monthly_income max_monthly_income
## 1 Ethiopian 1810.3793 0 6250
## 2 Karen Burmese 1166.6667 100 2200
## 3 Somali 1881.0667 0 3600
## 4 Somali Bantu 741.6667 0 1300
## 5 Sudanese 1265.5000 600 1733
## 6 Swahili 1322.8571 280 2100
## N
## 1 87
## 2 3
## 3 15
## 4 6
## 5 6
## 6 7
# Monthly income by cultural group, using only rows with a recorded group:
# first as a box plot, then as the individual observations.
ggplot(
  agg_rm,
  aes(x = cultural_group, y = monthly_income, group = cultural_group)
) +
  geom_boxplot() +
  labs(x = "Cultural Group", y = "Monthly Income")
ggplot(
  agg_rm,
  aes(x = cultural_group, y = monthly_income, group = cultural_group)
) +
  geom_point() +
  labs(x = "Cultural Group", y = "Monthly Income")
# Count respondents by year of arrival in the USA.
table(agg[["year_arrival_usa"]])
##
## 1980 1982 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996
## 1 3 2 1 1 1 1 5 10 3 1 2 1 3 2
## 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011
## 3 3 5 4 11 4 8 28 11 15 18 15 10 13 9
## 2012 2013 2014 2015
## 6 8 9 8
# Years spent in the US, measured from the arrival year up to 2016.
# NOTE(review): 2016 is presumably the survey year -- confirm.
agg[["years_in_us"]] <- 2016 - agg[["year_arrival_usa"]]
# Count respondents at each English level.
table(agg[["eng_level"]])
##
## 0 1 2 3 4 5 6 7 8 9 10 11
## 3 90 25 24 27 41 35 43 41 30 119 7
# English levels are whole numbers, so draw one bar per level rather than
# relying on geom_histogram()'s default of 30 bins.
ggplot(agg, aes(x = eng_level)) +
  geom_histogram(binwidth = 1)
# Restrict to four cultural groups, then plot English level against years in
# the US with one facet row per group.
agg_rm <- subset(
  agg,
  cultural_group %in% c("Ethiopian", "Karen Burmese", "Somali", "Swahili")
)
ggplot(
  agg_rm,
  aes(
    x = years_in_us,
    y = eng_level,
    group = cultural_group,
    color = cultural_group
  )
) +
  geom_jitter() +
  facet_grid(cultural_group ~ .) +
  labs(x = "Years in US", y = "English Fluency")
# Rows belonging to the Sudanese group only.
sudan <- subset(agg, cultural_group %in% "Sudanese")
The Sudanese group is missing the year of arrival for nearly every respondent:
# Tally recorded (FALSE) versus missing (TRUE) arrival years for the Sudanese rows.
table(is.na(sudan[["year_arrival_usa"]]))
##
## FALSE TRUE
## 1 54
We can still look at the spread of English fluency levels:
# One bar per whole-number English level instead of the default 30 bins.
ggplot(sudan, aes(x = eng_level)) +
  geom_histogram(binwidth = 1)
# Average English level for the Sudanese group, ignoring missing values.
mean(sudan[["eng_level"]], na.rm = TRUE)
## [1] 6.264151
# Keep rows with a recorded cultural group, then overlay one smoothed
# English-level density curve per group.
agg_rm <- agg[!is.na(agg$cultural_group), , drop = FALSE]
ggplot(
  agg_rm,
  aes(
    x = eng_level,
    group = cultural_group,
    color = cultural_group,
    fill = cultural_group
  )
) +
  geom_density(adjust = 2, alpha = 0.1) +
  labs(x = "English Fluency")
# Cross-tabulate general health appraisal (rows) against cultural group (columns).
table(hea[["health_general_appraisal"]], hea[["cultural_group"]])
##
## Ethiopian Karen Burmese Somali Somali Bantu Sudanese Swahili
## 1 34 2 18 8 1 5
## 2 40 5 3 10 0 7
## 3 31 11 4 9 0 4
## 4 12 18 1 5 0 3
## 5 0 3 0 4 0 0
# Drop the Sudanese group, which has only a single health response. subset()
# also drops rows whose cultural group is NA, because an NA condition counts
# as FALSE.
hea_rm <- subset(hea, cultural_group != "Sudanese")
# Health appraisals are whole-number scores, so draw one bar per score
# (binwidth = 1) rather than the default 30 bins; one facet row per group.
ggplot(
  hea_rm,
  aes(x = health_general_appraisal, group = cultural_group)
) +
  geom_histogram(binwidth = 1) +
  facet_grid(cultural_group ~ .) +
  labs(x = "Health", y = "Respondents")