rm(list=ls())
## Install "pacman" package if not installed
# (remove the # symbol from the line below):
# install.packages("pacman")
## Load R packages:
pacman::p_load(data.table, tidyverse, haven, vtable,
psych, scales, weights, clipr, forcats,
stargazer, ggthemes, ggcharts, geomtextpath,
corrplot)
## Import BCL dataset:
ds <- read_sav("~/Desktop/uganda_project/BCL Dataset.sav")
ds <- as.data.table(ds)
## Extract labels from categorical variables:
ds$Country <- as_factor(ds$Country)Example Data Analysis and Visualization
BCL Dataset
Introduction
This is a sample analysis. I use the entirety of the “BCL dataset” csv file, and proceed to do three things:
Data importation / cleaning
Exploratory data visualization of several relevant variables
Factor analysis and reliability analysis of one particular scale found in the dataset
All of the R code used in the analysis, as well as the test results can be seen in the code chunk sections, and the plots should be rendered in a standard web browser. Please let me know if any part of this analysis / visualization is unclear.
Section 1: Data Cleaning and Exploratory Data Visualization
Variable: Country
lp01 <- ds %>% drop_na(Country) %>%
lollipop_chart(x = Country,
line_color = "black",
point_color = "black")+
labs(y = "Frequency",
title = "Sample size by country")+
theme_bw()
lp01Variable: Age
ds <- ds %>% drop_na(age)
ds$age <- as.numeric(ds$age)
summary(ds$age) Min. 1st Qu. Median Mean 3rd Qu. Max.
18.00 23.00 27.00 29.65 33.00 82.00
ds %>% drop_na(age)%>%
ggplot(aes(x = age))+
geom_histogram(color = "black",
fill = "gray",
bins = 50)+
geom_textvline(label = "Mean = 29.65",
xintercept = mean(ds$age),
vjust = 1.3,
lwd = 1.05,
linetype = 2)+
labs(x = "Age",
y = "Frequency",
title = "Age distribution (full sample)")+
theme_bw()ds %>% drop_na(Country)%>%
ggplot(aes(x = age,
y = Country,
color = Country))+
geom_point(show.legend = FALSE)+
labs(x = "Age",
title = "Age distribution by country")+
scale_color_colorblind()+
theme_bw()Variable: Survey duration
ds$seconds <- ds$Duration__in_seconds_
ds$minutes <- (ds$seconds/60)
ds$hours <- (ds$minutes/60)
summary(ds$minutes) Min. 1st Qu. Median Mean 3rd Qu. Max.
7.867 24.750 36.233 58.197 52.921 2000.617
ds %>% drop_na(Country)%>%
ggplot(aes(x = hours))+
geom_histogram(color = "black",
fill = "gray",
bins = 200)+
geom_textvline(label = "Mean = 0.97",
xintercept = mean(ds$hours),
vjust = 1.3,
lwd = 1.05,
linetype = 2)+
labs(x = "Hours",
y = "Frequency",
title = "Survey duration (full sample)")+
theme_bw()ds %>% drop_na(Country)%>%
ggplot(aes(x = hours,
y = Country,
color = Country))+
geom_point(show.legend = FALSE)+
labs(x = "Hours",
title = "Survey duration by country")+
scale_color_colorblind()+
theme_bw()Variable: Gender
ds$gend1 <- as_factor(ds$gend)
ds$gend2 <- as_factor(ds$gend_6_TEXT)
ds$gend3 <- ifelse(ds$gend2=="Female", "Female",
ifelse(ds$gend2=="IM FEMAIL", "Female",
ifelse(ds$gend2=="Male", "Male",
ifelse(ds$gend2=="male", "Male", ""))))
ds$gender <- ifelse(ds$gend1 == "Male", "Male",
ifelse(ds$gend3 == "Male", "Male",
ifelse(ds$gend1 == "Female", "Female",
ifelse(ds$gend3 == "Female", "Female", NA))))
table(ds$gender)
Female Male
754 1222
lp02 <- ds %>% drop_na(gender) %>%
lollipop_chart(x = gender,
line_color = "black",
point_color = "black")+
labs(x = "Gender",
y = "Frequency",
title = "Gender distribution")+
theme_bw()
lp02Variable: Wealth level
ds$wealth_level <- as.numeric(ds$wealth_4)
summary(ds$wealth_level) Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 28.00 50.00 46.24 55.00 100.00 5
ds %>% drop_na(wealth_level, Country, gender) %>%
ggplot(aes(y = wealth_level,
x = Country))+
geom_violin(fill = "grey")+
facet_wrap(~gender,
ncol = 2)+
labs(y = "Self-reported relative wealth status")+
coord_flip()+
theme_bw()Variable: Education
ds$edu1 <- as_factor(ds$Education)
lp04 <- ds %>% drop_na(edu1) %>%
group_by(edu1) %>%
summarise(count = n()) %>%
ggplot(aes(edu1, count)) +
geom_segment(aes(x=edu1, xend=edu1,
y=0, yend=count))+
geom_point()+
labs(x = "",
y = "Frequency",
title = "Education distribution (full sample)")+
coord_flip()+
theme_bw()
lp04Variable: Religion
ds$religion2 <- as_factor(ds$religion)
ds$religion2 <- as.character(ds$religion2)
ds$religion2 <- ifelse(ds$religion2=="Other (please specify)", "Other", ds$religion2)
lp06 <- ds %>% drop_na(Country, religion2) %>%
lollipop_chart(x = religion2,
line_color = "black",
point_color = "black")+
labs(x = "",
y = "Frequency",
title = "Religion distribution (full sample)")+
theme_bw()
lp06Section 2. Factor Analysis Example: BCL/BBL Endorsement
Correlation matrix
endorse <- cbind.data.frame(ds$endorse_style_1, ds$endorse_style_2,
ds$endorse_style_3, ds$endorse_style_4,
ds$endorse_style_5, ds$endorse_style_6)
endorse2 <- na.omit(endorse)
datamatrix <- cor(endorse2[, c(1:6)])
corrplot(datamatrix, method = "number")Kaiser-Meyer-Olkin (KMO) test of factorability:
KMO(r=cor(endorse2))Kaiser-Meyer-Olkin factor adequacy
Call: KMO(r = cor(endorse2))
Overall MSA = 0.9
MSA for each item =
ds$endorse_style_1 ds$endorse_style_2 ds$endorse_style_3 ds$endorse_style_4
0.93 0.92 0.90 0.87
ds$endorse_style_5 ds$endorse_style_6
0.85 0.95
Bartlett’s test of sphericity:
cortest.bartlett(endorse2)$chisq
[1] 6405.788
$p.value
[1] 0
$df
[1] 15
Parrallel test:
parallel <- fa.parallel(endorse2)Parallel analysis suggests that the number of factors = 2 and the number of components = 1
Factor Analysis: Two factor model with promax rotation:
fit <- factanal(endorse2, 2, rotation="promax")
print(fit, digits=2, cutoff=0.3, sort=TRUE)
Call:
factanal(x = endorse2, factors = 2, rotation = "promax")
Uniquenesses:
ds$endorse_style_1 ds$endorse_style_2 ds$endorse_style_3 ds$endorse_style_4
0.40 0.42 0.31 0.24
ds$endorse_style_5 ds$endorse_style_6
0.17 0.61
Loadings:
Factor1 Factor2
ds$endorse_style_1 0.55
ds$endorse_style_2 0.86
ds$endorse_style_3 0.65
ds$endorse_style_4 0.76
ds$endorse_style_5 0.86
ds$endorse_style_6 0.30 0.35
Factor1 Factor2
SS loadings 1.57 1.56
Proportion Var 0.26 0.26
Cumulative Var 0.26 0.52
Factor Correlations:
Factor1 Factor2
Factor1 1.00 0.84
Factor2 0.84 1.00
Test of the hypothesis that 2 factors are sufficient.
The chi square statistic is 6.26 on 4 degrees of freedom.
The p-value is 0.18
Reliability of BCL/BBL Endorsement Scale (Chronbach’s Alpha)
## Reliability of endorsement scale:
psych::alpha(endorse)
Reliability analysis
Call: psych::alpha(x = endorse)
raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
0.9 0.9 0.89 0.59 8.6 0.0036 4.7 1.7 0.56
95% confidence boundaries
lower alpha upper
Feldt 0.89 0.9 0.9
Duhachek 0.89 0.9 0.9
Reliability if an item is dropped:
raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r
ds$endorse_style_1 0.88 0.88 0.86 0.59 7.1 0.0043 0.0097
ds$endorse_style_2 0.89 0.89 0.87 0.61 7.9 0.0040 0.0088
ds$endorse_style_3 0.87 0.87 0.85 0.58 6.8 0.0045 0.0088
ds$endorse_style_4 0.87 0.87 0.85 0.57 6.6 0.0047 0.0061
ds$endorse_style_5 0.86 0.86 0.84 0.56 6.4 0.0048 0.0051
ds$endorse_style_6 0.90 0.90 0.88 0.63 8.6 0.0037 0.0051
med.r
ds$endorse_style_1 0.55
ds$endorse_style_2 0.63
ds$endorse_style_3 0.55
ds$endorse_style_4 0.56
ds$endorse_style_5 0.55
ds$endorse_style_6 0.63
Item statistics
n raw.r std.r r.cor r.drop mean sd
ds$endorse_style_1 1902 0.82 0.81 0.76 0.72 4.8 2.1
ds$endorse_style_2 1902 0.77 0.77 0.69 0.66 4.5 2.0
ds$endorse_style_3 1902 0.84 0.84 0.80 0.76 4.8 2.0
ds$endorse_style_4 1904 0.86 0.85 0.83 0.78 5.0 2.2
ds$endorse_style_5 1902 0.88 0.87 0.86 0.80 5.1 2.2
ds$endorse_style_6 1899 0.73 0.72 0.63 0.60 4.5 2.1
Non missing response frequency for each item
1 2 3 4 5 6 7 miss
ds$endorse_style_1 0.13 0.08 0.08 0.10 0.14 0.14 0.32 0.05
ds$endorse_style_2 0.12 0.09 0.09 0.14 0.18 0.16 0.22 0.05
ds$endorse_style_3 0.11 0.07 0.09 0.12 0.16 0.17 0.28 0.05
ds$endorse_style_4 0.13 0.06 0.06 0.08 0.13 0.15 0.39 0.04
ds$endorse_style_5 0.13 0.06 0.07 0.08 0.12 0.14 0.41 0.05
ds$endorse_style_6 0.14 0.09 0.09 0.14 0.16 0.14 0.24 0.05