PANA Data

The PANA data

Read the data in. (Make sure the readxl and plyr packages are installed.)

#install.packages('readxl')
library(readxl)
library(plyr)
# Load the four PANA result workbooks. Paths are relative to the current
# working directory.
jobs <- readxl::read_excel(path = "PANA JOB results.xlsx")
agg <- readxl::read_excel(path = "PANA aggregated results.xlsx")
hea_edu <- readxl::read_excel(path = "PANA HEA_EDU results.xlsx")
hea <- readxl::read_excel(path = "PANA HEA results.xlsx")

Example: Monthly Income and cultural group

How many of the monthly income values are missing?

# Count recorded (FALSE) versus missing (TRUE) monthly income values.
table(is.na(agg[["monthly_income"]]))

## 
## FALSE  TRUE 
##   124   384

Is there a relationship between missingness and cultural group?

# Cross-tabulate cultural group (rows) against whether monthly income is
# missing (columns).
table(agg[["cultural_group"]], is.na(agg[["monthly_income"]]))

##                
##                 FALSE TRUE
##   Ethiopian        87   48
##   Karen Burmese     3  114
##   Somali           15   67
##   Somali Bantu      6   85
##   Sudanese          6   49
##   Swahili           7   18

Make sure the ggplot2 package is installed!

#install.packages("ggplot2")
library(ggplot2)
# Box plot of monthly income, one box per cultural group.
ggplot(agg, aes(x = cultural_group, y = monthly_income, group = cultural_group)) +
  geom_boxplot() +
  labs(x = "Cultural Group", y = "Monthly Income")

Let’s make a table of summary statistics (mean, minimum, maximum, and number of responses) of monthly income for each cultural group.

#install.packages("plyr")
# Keep only respondents with a recorded cultural group, so the summary below
# covers named groups only.
agg_rm <- subset(agg, !is.na(cultural_group))
# Per-group summary of monthly income. Missing incomes are ignored, and N is
# the number of respondents in each group with a recorded income.
ddply(agg_rm, .(cultural_group), summarise,
      mean_monthly_income = mean(monthly_income, na.rm = TRUE),
      min_monthly_income = min(monthly_income, na.rm = TRUE),
      max_monthly_income = max(monthly_income, na.rm = TRUE),
      N = sum(!is.na(monthly_income)))

##   cultural_group mean_monthly_income min_monthly_income max_monthly_income
## 1      Ethiopian           1810.3793                  0               6250
## 2  Karen Burmese           1166.6667                100               2200
## 3         Somali           1881.0667                  0               3600
## 4   Somali Bantu            741.6667                  0               1300
## 5       Sudanese           1265.5000                600               1733
## 6        Swahili           1322.8571                280               2100
##    N
## 1 87
## 2  3
## 3 15
## 4  6
## 5  6
## 6  7

# Box plot of monthly income per cultural group, using agg_rm.
ggplot(agg_rm, aes(x = cultural_group, y = monthly_income, group = cultural_group)) +
  geom_boxplot() +
  labs(x = "Cultural Group", y = "Monthly Income")

# Same data as individual points, to show how few incomes some groups have.
ggplot(agg_rm, aes(x = cultural_group, y = monthly_income, group = cultural_group)) +
  geom_point() +
  labs(x = "Cultural Group", y = "Monthly Income")

Example: English level and time in the US

# Number of respondents by year of arrival in the USA.
table(agg[["year_arrival_usa"]])

## 
## 1980 1982 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 
##    1    3    2    1    1    1    1    5   10    3    1    2    1    3    2 
## 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 
##    3    3    5    4   11    4    8   28   11   15   18   15   10   13    9 
## 2012 2013 2014 2015 
##    6    8    9    8

# Years since arrival, measured from 2016.
# NOTE(review): 2016 is presumably the survey year -- confirm.
agg[["years_in_us"]] <- 2016 - agg[["year_arrival_usa"]]

# Number of respondents at each English level.
table(agg[["eng_level"]])

## 
##   0   1   2   3   4   5   6   7   8   9  10  11 
##   3  90  25  24  27  41  35  43  41  30 119   7

# eng_level takes whole-number values (0-11 in this data), so use a bin width
# of 1 rather than ggplot2's default of 30 bins, which leaves gaps between
# bars and triggers the "Pick better value with `binwidth`" message.
ggplot(agg, aes(x = eng_level)) +
  geom_histogram(binwidth = 1)

# Restrict to four of the cultural groups, then plot English level against
# years in the US as jittered points, with one facet row per group.
agg_rm <- subset(
  agg,
  cultural_group %in% c("Ethiopian", "Karen Burmese", "Somali", "Swahili")
)
ggplot(
  agg_rm,
  aes(x = years_in_us, y = eng_level,
      group = cultural_group, color = cultural_group)
) +
  geom_jitter() +
  facet_grid(cultural_group ~ .) +
  labs(x = "Years in US", y = "English Fluency")

Example: A single group

# Rows of agg for the Sudanese group only.
sudan <- subset(agg, subset = cultural_group == "Sudanese")

Almost all of the Sudanese respondents are missing the year of arrival:

# Count recorded (FALSE) versus missing (TRUE) arrival years for this group.
table(is.na(sudan[["year_arrival_usa"]]))

## 
## FALSE  TRUE 
##     1    54

We can still look at the spread of English levels:

# English level is recorded as whole numbers, so use a bin width of 1 rather
# than ggplot2's default of 30 bins.
ggplot(sudan, aes(x = eng_level)) +
  geom_histogram(binwidth = 1)

# Mean English level for the Sudanese group, ignoring missing values.
mean(sudan[["eng_level"]], na.rm = TRUE)

## [1] 6.264151

# Keep respondents with a recorded cultural group, then overlay one smoothed
# density curve of English level per group.
agg_rm <- subset(agg, subset = !is.na(cultural_group))
ggplot(
  agg_rm,
  aes(x = eng_level, group = cultural_group,
      color = cultural_group, fill = cultural_group)
) +
  geom_density(adjust = 2, alpha = 0.1) +
  labs(x = "English Fluency")

Example: Health Appraisal

# Cross-tabulate general health appraisal (rows) against cultural group
# (columns).
table(hea[["health_general_appraisal"]], hea[["cultural_group"]])

##    
##     Ethiopian Karen Burmese Somali Somali Bantu Sudanese Swahili
##   1        34             2     18            8        1       5
##   2        40             5      3           10        0       7
##   3        31            11      4            9        0       4
##   4        12            18      1            5        0       3
##   5         0             3      0            4        0       0

# Exclude the Sudanese group before plotting. subset() also drops rows whose
# cultural_group is NA, because an NA condition is treated as FALSE.
hea_rm <- subset(hea, cultural_group != "Sudanese")
# The appraisal takes whole-number values (1-5 in this data), so use a bin
# width of 1 rather than ggplot2's default of 30 bins. One facet row per
# cultural group.
ggplot(hea_rm, aes(x = health_general_appraisal, group = cultural_group)) +
  geom_histogram(binwidth = 1) +
  facet_grid(cultural_group ~ .) +
  xlab("Health") +
  ylab("Respondents")