Example Data Analysis and Visualization
BCL Dataset

Author

Gagan Atreya

Introduction

This is a sample analysis. I use the entirety of the “BCL dataset” csv file, and proceed to do three things:

  • Data importation / cleaning

  • Exploratory data visualization of several relevant variables

  • Factor analysis and reliability analysis of one particular scale found in the dataset

All of the R code used in the analysis, as well as the test results can be seen in the code chunk sections, and the plots should be rendered in a standard web browser. Please let me know if any part of this analysis / visualization is unclear.

Section 1: Data Cleaning and Exploratory Data Visualization

rm(list=ls())

## Install "pacman" package if not installed
# (remove the # symbol from the line below):
# install.packages("pacman")

## Load R packages:
pacman::p_load(data.table, tidyverse, haven, vtable, 
               psych, scales, weights, clipr, forcats,
               stargazer, ggthemes, ggcharts, geomtextpath,
               corrplot)

## Import BCL dataset:
ds <- read_sav("~/Desktop/uganda_project/BCL Dataset.sav")
ds <- as.data.table(ds)

## Extract labels from categorical variables:
ds$Country <- as_factor(ds$Country)

Variable: Country

lp01 <- ds %>% drop_na(Country) %>%
lollipop_chart(x = Country,
               line_color = "black",
               point_color = "black")+
  labs(y = "Frequency",
       title = "Sample size by country")+
  theme_bw()

lp01

Variable: Age

ds <- ds %>% drop_na(age)
ds$age <- as.numeric(ds$age)

summary(ds$age)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  18.00   23.00   27.00   29.65   33.00   82.00 
ds %>% drop_na(age)%>%
ggplot(aes(x = age))+
  geom_histogram(color = "black",
                 fill = "gray",
                 bins = 50)+
  geom_textvline(label = "Mean = 29.65", 
                 xintercept = mean(ds$age), 
                 vjust = 1.3, 
                 lwd = 1.05, 
                 linetype = 2)+
  labs(x = "Age", 
       y = "Frequency", 
       title = "Age distribution (full sample)")+
  theme_bw()

ds %>% drop_na(Country)%>%
ggplot(aes(x = age, 
           y = Country, 
           color = Country))+
  geom_point(show.legend = FALSE)+
  labs(x = "Age",
       title = "Age distribution by country")+
  scale_color_colorblind()+
  theme_bw()

Variable: Survey duration

ds$seconds <- ds$Duration__in_seconds_
ds$minutes <- (ds$seconds/60)
ds$hours <- (ds$minutes/60)

summary(ds$minutes)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
   7.867   24.750   36.233   58.197   52.921 2000.617 
ds %>% drop_na(Country)%>%
ggplot(aes(x = hours))+
  geom_histogram(color = "black",
                 fill = "gray",
                 bins = 200)+
  geom_textvline(label = "Mean = 0.97", 
                 xintercept = mean(ds$hours), 
                 vjust = 1.3, 
                 lwd = 1.05, 
                 linetype = 2)+
  labs(x = "Hours", 
       y = "Frequency", 
       title = "Survey duration (full sample)")+
  theme_bw()

ds %>% drop_na(Country)%>%
ggplot(aes(x = hours, 
           y = Country, 
           color = Country))+
  geom_point(show.legend = FALSE)+
  labs(x = "Hours",
       title = "Survey duration by country")+
  scale_color_colorblind()+
  theme_bw()

Variable: Gender

ds$gend1 <- as_factor(ds$gend)
ds$gend2 <- as_factor(ds$gend_6_TEXT)

ds$gend3 <- ifelse(ds$gend2=="Female", "Female",
            ifelse(ds$gend2=="IM FEMAIL", "Female",
            ifelse(ds$gend2=="Male", "Male",
            ifelse(ds$gend2=="male", "Male", ""))))

ds$gender <- ifelse(ds$gend1 == "Male", "Male",
             ifelse(ds$gend3 == "Male", "Male",
             ifelse(ds$gend1 == "Female", "Female",
             ifelse(ds$gend3 == "Female", "Female", NA))))

table(ds$gender)

Female   Male 
   754   1222 
lp02 <- ds %>% drop_na(gender) %>%
lollipop_chart(x = gender,
               line_color = "black",
               point_color = "black")+
  labs(x = "Gender", 
       y = "Frequency",
       title = "Gender distribution")+
  theme_bw()

lp02

Variable: Wealth level

ds$wealth_level <- as.numeric(ds$wealth_4)
summary(ds$wealth_level)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00   28.00   50.00   46.24   55.00  100.00       5 
ds %>% drop_na(wealth_level, Country, gender) %>%
  ggplot(aes(y = wealth_level, 
             x = Country))+
  geom_violin(fill = "grey")+
  facet_wrap(~gender, 
             ncol = 2)+
  labs(y = "Self-reported relative wealth status")+
  coord_flip()+
  theme_bw()

Variable: Education

ds$edu1 <- as_factor(ds$Education)

lp04 <- ds %>% drop_na(edu1) %>%
  group_by(edu1) %>% 
  summarise(count = n()) %>% 
  ggplot(aes(edu1, count)) + 
  geom_segment(aes(x=edu1, xend=edu1, 
                   y=0, yend=count))+ 
  geom_point()+
  labs(x = "", 
       y = "Frequency", 
       title = "Education distribution (full sample)")+
  coord_flip()+
  theme_bw()

lp04

Variable: Religion

ds$religion2 <- as_factor(ds$religion)
ds$religion2 <- as.character(ds$religion2)

ds$religion2 <- ifelse(ds$religion2=="Other (please specify)", "Other", ds$religion2)

lp06 <- ds %>% drop_na(Country, religion2) %>%
lollipop_chart(x = religion2,
               line_color = "black",
               point_color = "black")+
  labs(x = "",
       y = "Frequency",
       title = "Religion distribution (full sample)")+
  theme_bw()

lp06

Section 2. Factor Analysis Example: BCL/BBL Endorsement

Correlation matrix

endorse <- cbind.data.frame(ds$endorse_style_1, ds$endorse_style_2, 
                            ds$endorse_style_3, ds$endorse_style_4, 
                            ds$endorse_style_5, ds$endorse_style_6)

endorse2 <- na.omit(endorse)

datamatrix <- cor(endorse2[, c(1:6)])
corrplot(datamatrix, method = "number")

Kaiser-Meyer-Olkin (KMO) test of factorability:

KMO(r=cor(endorse2))
Kaiser-Meyer-Olkin factor adequacy
Call: KMO(r = cor(endorse2))
Overall MSA =  0.9
MSA for each item = 
ds$endorse_style_1 ds$endorse_style_2 ds$endorse_style_3 ds$endorse_style_4 
              0.93               0.92               0.90               0.87 
ds$endorse_style_5 ds$endorse_style_6 
              0.85               0.95 

Bartlett’s test of sphericity:

cortest.bartlett(endorse2)
$chisq
[1] 6405.788

$p.value
[1] 0

$df
[1] 15

Parrallel test:

parallel <- fa.parallel(endorse2)

Parallel analysis suggests that the number of factors =  2  and the number of components =  1 

Factor Analysis: Two factor model with promax rotation:

fit <- factanal(endorse2, 2, rotation="promax")

print(fit, digits=2, cutoff=0.3, sort=TRUE)

Call:
factanal(x = endorse2, factors = 2, rotation = "promax")

Uniquenesses:
ds$endorse_style_1 ds$endorse_style_2 ds$endorse_style_3 ds$endorse_style_4 
              0.40               0.42               0.31               0.24 
ds$endorse_style_5 ds$endorse_style_6 
              0.17               0.61 

Loadings:
                   Factor1 Factor2
ds$endorse_style_1  0.55          
ds$endorse_style_2  0.86          
ds$endorse_style_3  0.65          
ds$endorse_style_4          0.76  
ds$endorse_style_5          0.86  
ds$endorse_style_6  0.30    0.35  

               Factor1 Factor2
SS loadings       1.57    1.56
Proportion Var    0.26    0.26
Cumulative Var    0.26    0.52

Factor Correlations:
        Factor1 Factor2
Factor1    1.00    0.84
Factor2    0.84    1.00

Test of the hypothesis that 2 factors are sufficient.
The chi square statistic is 6.26 on 4 degrees of freedom.
The p-value is 0.18 

Reliability of BCL/BBL Endorsement Scale (Chronbach’s Alpha)

## Reliability of endorsement scale:

psych::alpha(endorse)

Reliability analysis   
Call: psych::alpha(x = endorse)

  raw_alpha std.alpha G6(smc) average_r S/N    ase mean  sd median_r
       0.9       0.9    0.89      0.59 8.6 0.0036  4.7 1.7     0.56

    95% confidence boundaries 
         lower alpha upper
Feldt     0.89   0.9   0.9
Duhachek  0.89   0.9   0.9

 Reliability if an item is dropped:
                   raw_alpha std.alpha G6(smc) average_r S/N alpha se  var.r
ds$endorse_style_1      0.88      0.88    0.86      0.59 7.1   0.0043 0.0097
ds$endorse_style_2      0.89      0.89    0.87      0.61 7.9   0.0040 0.0088
ds$endorse_style_3      0.87      0.87    0.85      0.58 6.8   0.0045 0.0088
ds$endorse_style_4      0.87      0.87    0.85      0.57 6.6   0.0047 0.0061
ds$endorse_style_5      0.86      0.86    0.84      0.56 6.4   0.0048 0.0051
ds$endorse_style_6      0.90      0.90    0.88      0.63 8.6   0.0037 0.0051
                   med.r
ds$endorse_style_1  0.55
ds$endorse_style_2  0.63
ds$endorse_style_3  0.55
ds$endorse_style_4  0.56
ds$endorse_style_5  0.55
ds$endorse_style_6  0.63

 Item statistics 
                      n raw.r std.r r.cor r.drop mean  sd
ds$endorse_style_1 1902  0.82  0.81  0.76   0.72  4.8 2.1
ds$endorse_style_2 1902  0.77  0.77  0.69   0.66  4.5 2.0
ds$endorse_style_3 1902  0.84  0.84  0.80   0.76  4.8 2.0
ds$endorse_style_4 1904  0.86  0.85  0.83   0.78  5.0 2.2
ds$endorse_style_5 1902  0.88  0.87  0.86   0.80  5.1 2.2
ds$endorse_style_6 1899  0.73  0.72  0.63   0.60  4.5 2.1

Non missing response frequency for each item
                      1    2    3    4    5    6    7 miss
ds$endorse_style_1 0.13 0.08 0.08 0.10 0.14 0.14 0.32 0.05
ds$endorse_style_2 0.12 0.09 0.09 0.14 0.18 0.16 0.22 0.05
ds$endorse_style_3 0.11 0.07 0.09 0.12 0.16 0.17 0.28 0.05
ds$endorse_style_4 0.13 0.06 0.06 0.08 0.13 0.15 0.39 0.04
ds$endorse_style_5 0.13 0.06 0.07 0.08 0.12 0.14 0.41 0.05
ds$endorse_style_6 0.14 0.09 0.09 0.14 0.16 0.14 0.24 0.05