# Setup
##Install and load the necessary packages to reproduce the report
library(haven) # Useful for importing SPSS, SAS, STATA etc. data files
library(foreign) # Useful for importing SPSS, SAS, STATA etc. data files
library(haven)
# Import the Stata-format SLDHS file, then inspect how the education
# variable (V106) was read in.
SLDHS <- read_dta("~/SLDHS12.dta")
str(SLDHS[["V106"]])
## dbl+lbl [1:17686] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## @ label : chr "Highest educational level"
## @ format.stata: chr "%1.0f"
## @ labels : Named num [1:4] 0 1 2 3
## ..- attr(*, "names")= chr [1:4] "No Education" "Primary" "Secondary" "Higher"
# Loading the dplyr package
library(dplyr)
# Keep only the eleven variables used in the rest of the analysis, then
# inspect the structure of the reduced data set.
sLDHS_DATA1 <- SLDHS %>%
  select(
    V190, V024, V025, V106, V152, V151,
    V136, V201, V501, V113, V116
  )
str(sLDHS_DATA1)
## tibble [17,686 x 11] (S3: tbl_df/tbl/data.frame)
## $ V190: dbl+lbl [1:17686] 5, 5, 5, 5, 5, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3...
## ..@ label : chr "Wealth index combined"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Lowest" "Second" "Middle" "Fourth" ...
## $ V024: dbl+lbl [1:17686] 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, ...
## ..@ label : chr "Region"
## ..@ format.stata: chr "%2.0f"
## ..@ labels : Named num [1:6] 11 12 13 14 15 16
## .. ..- attr(*, "names")= chr [1:6] "Awdal" " Marodijeh" "Sahil" "Togdheer" ...
## $ V025: dbl+lbl [1:17686] 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## ..@ label : chr "Type of place of residence"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:6] 1 2 3 4 5 6
## .. ..- attr(*, "names")= chr [1:6] "Rural" "Urban" "Nomadic" "Rural IDP" ...
## $ V106: dbl+lbl [1:17686] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "Highest educational level"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:4] 0 1 2 3
## .. ..- attr(*, "names")= chr [1:4] "No Education" "Primary" "Secondary" "Higher"
## $ V152: num [1:17686] 23 23 23 23 23 61 61 23 23 23 ...
## ..- attr(*, "label")= chr "Age of household head"
## ..- attr(*, "format.stata")= chr "%2.0f"
## $ V151: num [1:17686] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "label")= chr "Sex of household head"
## ..- attr(*, "format.stata")= chr "%1.0f"
## $ V136: num [1:17686] 6 6 6 6 6 4 4 6 6 6 ...
## ..- attr(*, "label")= chr "Number of household members (listed)"
## ..- attr(*, "format.stata")= chr "%1.0f"
## $ V201: num [1:17686] 5 5 5 5 5 2 2 4 4 4 ...
## ..- attr(*, "label")= chr "Total children ever born"
## ..- attr(*, "format.stata")= chr "%2.0f"
## $ V501: dbl+lbl [1:17686] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## ..@ label : chr "Current marital status"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:4] 0 1 2 3
## .. ..- attr(*, "names")= chr [1:4] "Never Married" "Married" "Divorced" "Widowed"
## $ V113: num [1:17686] 11 11 11 11 11 13 13 13 13 13 ...
## ..- attr(*, "label")= chr "Source of drinking water"
## ..- attr(*, "format.stata")= chr "%2.0f"
## $ V116: num [1:17686] 23 23 23 23 23 61 61 23 23 23 ...
## ..- attr(*, "label")= chr "Type of toilet facility"
## ..- attr(*, "format.stata")= chr "%2.0f"
# Central tendency and spread of V136 (number of household members),
# ignoring missing values.
with(sLDHS_DATA1, mean(V136, na.rm = TRUE))
## [1] 5.107781
with(sLDHS_DATA1, median(V136, na.rm = TRUE))
## [1] 5
with(sLDHS_DATA1, sd(V136, na.rm = TRUE))
## [1] 2.475511
# Frequency of each education level (V106).
table(sLDHS_DATA1[["V106"]])
##
## 0 1 2 3
## 15287 1991 311 97
# Share of records falling in each education level (V106).
prop.table(table(sLDHS_DATA1[["V106"]]))
##
## 0 1 2 3
## 0.864355988 0.112574918 0.017584530 0.005484564
# Count the missing values in each variable of the pair.
sum(is.na(sLDHS_DATA1[["V152"]]))
## [1] 862
sum(is.na(sLDHS_DATA1[["V201"]]))
## [1] 13
# Drop every row where either variable is missing, then compute Pearson's r
# on the remaining complete pairs.
data_clean <- na.omit(sLDHS_DATA1[c("V152", "V201")])
correlation <- with(data_clean, cor(V152, V201, method = "pearson"))
print(correlation)
## [1] 0.04612714
# Interpretation: the correlation between V152 and V201 is positive but very
# weak (r is about 0.05), so V201 rises only marginally as V152 increases.
# Load the dplyr package
library(dplyr)
# Derive a two-level poverty indicator from the wealth index (V190):
# values 1-3 are coded "Poor", values 4-5 "Non-Poor", anything else stays NA.
sLDHS_DATA1$poverty_status <- NA
sLDHS_DATA1$poverty_status[sLDHS_DATA1$V190 %in% c(1, 2, 3)] <- "Poor"
sLDHS_DATA1$poverty_status[sLDHS_DATA1$V190 %in% c(4, 5)] <- "Non-Poor"
table(sLDHS_DATA1$poverty_status)
##
## Non-Poor Poor
## 6293 11393
# Column-by-column summary of the working data set, including the new variable.
summary(sLDHS_DATA1)
## V190 V024 V025 V106
## Min. :1.000 Min. :11.00 Min. :1.000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:13.00 1st Qu.:1.000 1st Qu.:0.0000
## Median :2.000 Median :14.00 Median :1.000 Median :0.0000
## Mean :2.662 Mean :13.95 Mean :1.498 Mean :0.1642
## 3rd Qu.:4.000 3rd Qu.:15.00 3rd Qu.:2.000 3rd Qu.:0.0000
## Max. :5.000 Max. :16.00 Max. :2.000 Max. :3.0000
##
## V152 V151 V136 V201
## Min. :11.0 Min. :1.000 Min. :0.000 Min. : 0.000
## 1st Qu.:13.0 1st Qu.:1.000 1st Qu.:3.000 1st Qu.: 4.000
## Median :22.0 Median :1.000 Median :5.000 Median : 6.000
## Mean :26.2 Mean :1.376 Mean :5.108 Mean : 6.274
## 3rd Qu.:23.0 3rd Qu.:2.000 3rd Qu.:7.000 3rd Qu.: 8.000
## Max. :96.0 Max. :2.000 Max. :9.000 Max. :19.000
## NA's :862 NA's :858 NA's :2303 NA's :13
## V501 V113 V116 poverty_status
## Min. :1.000 Min. :11.00 Min. :11.0 Length:17686
## 1st Qu.:1.000 1st Qu.:12.00 1st Qu.:13.0 Class :character
## Median :1.000 Median :31.00 Median :22.0 Mode :character
## Mean :1.138 Mean :33.44 Mean :26.2
## 3rd Qu.:1.000 3rd Qu.:61.00 3rd Qu.:23.0
## Max. :3.000 Max. :96.00 Max. :96.0
## NA's :862 NA's :862
# Recode V116 (type of toilet facility) into a new two-level variable,
# 'toilet_facility': "improved" or "unimproved". Codes that appear in neither
# list, and missing values, are left as NA.
sLDHS_DATA1$toilet_facility <- with(
  sLDHS_DATA1,
  ifelse(
    V116 %in% c(11, 12, 13, 21, 22, 31), "improved",
    ifelse(V116 %in% c(14, 15, 23, 41, 51, 61, 96), "unimproved", NA)
  )
)
table(sLDHS_DATA1$toilet_facility)
##
## improved unimproved
## 9520 7304
# Recode V113 (source of drinking water) into a new two-level variable,
# 'Source_of_drinking_water': "improved" or "unimproved". Codes that appear in
# neither list, and missing values, are left as NA.
# NOTE(review): some non-missing codes fall in neither list (e.g. 31, the
# median of V113 in the summary output) and therefore stay NA; confirm against
# the codebook whether that is intended.
sLDHS_DATA1$Source_of_drinking_water <- with(
  sLDHS_DATA1,
  ifelse(
    V113 %in% c(11, 12, 13, 14, 21, 51, 61, 71, 72), "improved",
    ifelse(V113 %in% c(32, 42, 81, 96), "unimproved", NA)
  )
)
table(sLDHS_DATA1$Source_of_drinking_water)
##
## improved unimproved
## 12355 2311
# Before handling missing values, check the whole data set: first whether any
# value is missing at all, then how many cells are missing in total.
anyNA(sLDHS_DATA1)
## [1] TRUE
sum(is.na(sLDHS_DATA1))
## [1] 9642
library(ggplot2)
# Histogram of household size (V136). The title and x-axis label are written
# out explicitly; they are the same strings hist() would derive by default
# from the expression sLDHS_DATA1$V136.
hist(
  sLDHS_DATA1[["V136"]],
  breaks = 5,
  main = "Histogram of sLDHS_DATA1$V136",
  xlab = "sLDHS_DATA1$V136"
)
# Bar plot of the number of records in each poverty_status category.
barplot(table(sLDHS_DATA1[["poverty_status"]]))
# 2. Create a boxplot to compare the number of living children ("V201")
# between poor and non-poor households ("poverty_status").
# Compare the distribution of V201 (total children ever born) between the
# "Poor" and "Non-Poor" groups. The formula interface draws one box per level
# of poverty_status; passing V201 alone would draw a single, ungrouped box.
boxplot(
  V201 ~ poverty_status,
  data = sLDHS_DATA1,
  main = "Total children ever born (V201) by poverty status",
  xlab = "Poverty status",
  ylab = "Total children ever born (V201)",
  col = "blue"
)
# Choosing the right visualization in R (or any data visualization tool) is
# crucial for effective communication. Different visualization techniques
# highlight different aspects of the data, and using the wrong one can mislead
# the audience or obscure important information.
# Histograms: best for showing the distribution of a single numeric variable.
# They display the frequency of data points falling within specific ranges
# (bins) and are useful for identifying patterns such as skewness, modality
# (number of peaks), and outliers.
# Bar charts: best for comparing the frequencies or values of different
# categories or groups of a categorical (factor) variable. Each bar represents
# a category, and its height shows the corresponding value. Avoid using bar
# charts for continuous numeric data.
# Box plots: excellent for comparing the distribution of a numeric variable
# across different categorical (factor) groups. They show the median,
# quartiles, and potential outliers for each group, providing a concise
# summary of central tendency, spread, and potential extreme values.