This study uses the Wealth Index as the dependent variable to investigate the prevalence of poverty in Somaliland and the factors that contribute to it. The independent variables are community-level factors (region and type of place of residence), household-level factors (access to safe drinking water and sanitation facilities), and individual characteristics (age, sex, and marital status of the household head, education, number of household members, and number of children). Together, these factors provide a thorough framework for examining the causes of poverty.
# Load the packages used for import (haven) and data wrangling (dplyr)
library(haven)
library(dplyr)

# Load the dataset (update the path to your dataset file)
SLDHS <- read_dta("~/SLDHS.dta")

# Variables used in the analysis, defined once so that the missing-value
# filter, the verification step and the column selection cannot drift apart
analysis_vars <- c(
  "V190", "V024", "V025", "V106", "V152", "V151",
  "V136", "V201", "V501", "V113", "V116"
)

# Drop every row that has a missing value in any of the analysis variables
data_cleaned <- SLDHS %>%
  filter(if_all(all_of(analysis_vars), ~ !is.na(.x)))

# Check dimensions of the cleaned data
dim(data_cleaned)
## [1] 14514 563

# Verify no missing values remain in the analysis variables
# (vapply guarantees one integer count per column, unlike sapply)
vapply(data_cleaned[analysis_vars], function(x) sum(is.na(x)), integer(1))
## V190 V024 V025 V106 V152 V151 V136 V201 V501 V113 V116
## 0 0 0 0 0 0 0 0 0 0 0

# Select only the relevant variables
data_relevant <- data_cleaned %>%
  select(all_of(analysis_vars))
# 1. List all variables in the dataset
# Guard clause: abort with a clear message when data_relevant is not defined
if (!exists("data_relevant")) {
  stop("The variable 'data_relevant' does not exist. Please create it first.")
}
# Display the structure of the dataset (column types, labels, first values)
str(data_relevant)
## tibble [14,514 × 11] (S3: tbl_df/tbl/data.frame)
## $ V190: dbl+lbl [1:14514] 5, 5, 5, 5, 5, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3...
## ..@ label : chr "Wealth index combined"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Lowest" "Second" "Middle" "Fourth" ...
## $ V024: dbl+lbl [1:14514] 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, ...
## ..@ label : chr "Region"
## ..@ format.stata: chr "%2.0f"
## ..@ labels : Named num [1:6] 11 12 13 14 15 16
## .. ..- attr(*, "names")= chr [1:6] "Awdal" " Marodijeh" "Sahil" "Togdheer" ...
## $ V025: dbl+lbl [1:14514] 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## ..@ label : chr "Type of place of residence"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:6] 1 2 3 4 5 6
## .. ..- attr(*, "names")= chr [1:6] "Rural" "Urban" "Nomadic" "Rural IDP" ...
## $ V106: dbl+lbl [1:14514] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "Highest educational level"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:4] 0 1 2 3
## .. ..- attr(*, "names")= chr [1:4] "No Education" "Primary" "Secondary" "Higher"
## $ V152: num [1:14514] 23 23 23 23 23 61 61 23 23 23 ...
## ..- attr(*, "label")= chr "Age of household head"
## ..- attr(*, "format.stata")= chr "%2.0f"
## $ V151: num [1:14514] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "label")= chr "Sex of household head"
## ..- attr(*, "format.stata")= chr "%1.0f"
## $ V136: num [1:14514] 6 6 6 6 6 4 4 6 6 6 ...
## ..- attr(*, "label")= chr "Number of household members (listed)"
## ..- attr(*, "format.stata")= chr "%1.0f"
## $ V201: num [1:14514] 5 5 5 5 5 2 2 4 4 4 ...
## ..- attr(*, "label")= chr "Total children ever born"
## ..- attr(*, "format.stata")= chr "%2.0f"
## $ V501: dbl+lbl [1:14514] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## ..@ label : chr "Current marital status"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:4] 0 1 2 3
## .. ..- attr(*, "names")= chr [1:4] "Never Married" "Married" "Divorced" "Widowed"
## $ V113: num [1:14514] 11 11 11 11 11 13 13 13 13 13 ...
## ..- attr(*, "label")= chr "Source of drinking water"
## ..- attr(*, "format.stata")= chr "%2.0f"
## $ V116: num [1:14514] 23 23 23 23 23 61 61 23 23 23 ...
## ..- attr(*, "label")= chr "Type of toilet facility"
## ..- attr(*, "format.stata")= chr "%2.0f"
Identify and list all the variables provided in the variable code list:
# View the column names to identify the variables in the dataset
colnames(data_relevant)
# List the specific variables provided in the code list for confirmation
code_list <- c("V190", "V024", "V025", "V106", "V152", "V151", "V136", "V201", "V501", "V113", "V116")
Determine the data types of each variable:
# Check data types of each variable
sapply(data_relevant, class)
Briefly explain the difference between numeric and factor variables. Numeric variables: these contain numerical data and can be used in mathematical operations. They represent continuous or discrete quantities (e.g., age, income). Factor variables: these are categorical variables that represent distinct groups or categories. They are stored internally as integers but carry corresponding labels (e.g., gender, marital status).
# Central tendency and spread of household size (V136),
# ignoring any missing values
mean(data_cleaned[["V136"]], na.rm = TRUE)
## [1] 5.407744
median(data_cleaned[["V136"]], na.rm = TRUE)
## [1] 6
sd(data_cleaned[["V136"]], na.rm = TRUE)
## [1] 2.203568
# Frequency of each education level (V106), shown with category labels.
# Codes 0-3 are mapped to their labels explicitly through factor() instead of
# overwriting names(freq_table) by position: positional renaming mislabels the
# counts (or fails on a length mismatch) whenever a code is absent from the
# data, while factor() keeps absent categories as zero counts.
education_labels <- c("No Education", "Primary", "Secondary", "Higher")
freq_table <- table(
  factor(data_cleaned$V106, levels = c(0, 1, 2, 3), labels = education_labels)
)
print(freq_table)
## No Education Primary Secondary Higher
## 12595 1557 278 84
# Share of observations in each wealth quintile (V190), by numeric code
proportions_v190 <- prop.table(table(data_cleaned[["V190"]]))
# Display the proportions
proportions_v190
##
## 1 2 3 4 5
## 0.3690919 0.1726609 0.1164393 0.1605347 0.1812733
# Turn the quintile codes 1-5 into a factor with descriptive labels
data_cleaned[["V190"]] <- factor(
  data_cleaned[["V190"]],
  levels = 1:5,
  labels = c("Lowest", "Second", "Middle", "Fourth", "Highest")
)
# Same proportions, now reported under the quintile labels
labeled_proportions <- prop.table(table(data_cleaned[["V190"]]))
# Display labeled proportions
labeled_proportions
##
## Lowest Second Middle Fourth Highest
## 0.3690919 0.1726609 0.1164393 0.1605347 0.1812733
# Pearson correlation between the household head's age and number of children.
# Age of household head is V152; V151 is "Sex of household head", so using
# V151 here would correlate sex, not age, with the number of children.
# NOTE(review): in the output recorded in this file, V152 shows exactly the
# same values and summary statistics as V116 (type of toilet facility) --
# verify that V152 really holds ages in the source data.
age_household_head <- data_cleaned$V152
# NOTE(review): V201 is labelled "Total children ever born" in this dataset,
# not living children -- confirm that this is the intended variable.
num_living_children <- data_cleaned$V201
# use = "complete.obs" drops any pair containing a missing value
correlation <- cor(age_household_head, num_living_children, use = "complete.obs")
cat("Correlation Coefficient between age of household head and number of living children:", correlation, "\n")
## (re-run to regenerate this output; the previously recorded value,
## 0.005892517, was computed from V151, the sex of the household head)
To calculate the correlation coefficient between the household head's age (V152) and the number of children (V201) in R, use the cor() function. By default it computes Pearson's correlation, which measures the strength of the linear relationship. For instance, cor(data_relevant$V152, data_relevant$V201, use = "complete.obs") computes the correlation using only complete pairs of observations and yields a value between -1 and 1: the sign indicates the direction of the relationship and the magnitude its strength.
# 1. Derive poverty_status from the wealth quintile (V190)
# Step 1: inspect V190, then convert it to a plain numeric column
str(data_relevant[["V190"]])
## dbl+lbl [1:14514] 5, 5, 5, 5, 5, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,...
## @ label : chr "Wealth index combined"
## @ format.stata: chr "%1.0f"
## @ labels : Named num [1:5] 1 2 3 4 5
## ..- attr(*, "names")= chr [1:5] "Lowest" "Second" "Middle" "Fourth" ...
data_relevant <- data_relevant %>%
  mutate(V190 = as.numeric(as.character(V190)))
# Confirm the converted column holds valid quintile codes
summary(data_relevant[["V190"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.612 4.000 5.000
# Step 2: drop rows where V190 is missing
data_relevant <- data_relevant %>%
  filter(!is.na(V190))
# Steps 3-4: quintiles 1-3 are coded "Poor", quintiles 4-5 "Non-Poor"
# NOTE(review): this counts the "Middle" quintile as Poor -- confirm that this
# is the intended poverty definition.
data_relevant <- data_relevant %>%
  mutate(
    poverty_status = factor(
      V190 <= 3,
      levels = c(TRUE, FALSE),
      labels = c("Poor", "Non-Poor")
    )
  )
# Step 5: check the counts of the new variable and summarise the dataset
table(data_relevant$poverty_status)
##
## Poor Non-Poor
## 9553 4961
summary(data_relevant)
## V190 V024 V025 V106
## Min. :1.000 Min. :11.00 Min. :1.000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:13.00 1st Qu.:1.000 1st Qu.:0.0000
## Median :2.000 Median :14.00 Median :1.000 Median :0.0000
## Mean :2.612 Mean :14.02 Mean :1.473 Mean :0.1629
## 3rd Qu.:4.000 3rd Qu.:15.00 3rd Qu.:2.000 3rd Qu.:0.0000
## Max. :5.000 Max. :16.00 Max. :2.000 Max. :3.0000
## V152 V151 V136 V201
## Min. :11.00 Min. :1.000 Min. :1.000 Min. : 0.000
## 1st Qu.:13.00 1st Qu.:1.000 1st Qu.:4.000 1st Qu.: 4.000
## Median :22.00 Median :1.000 Median :6.000 Median : 6.000
## Mean :26.79 Mean :1.397 Mean :5.408 Mean : 6.253
## 3rd Qu.:23.00 3rd Qu.:2.000 3rd Qu.:7.000 3rd Qu.: 8.000
## Max. :96.00 Max. :2.000 Max. :9.000 Max. :19.000
## V501 V113 V116 poverty_status
## Min. :1.00 Min. :11.00 Min. :11.00 Poor :9553
## 1st Qu.:1.00 1st Qu.:12.00 1st Qu.:13.00 Non-Poor:4961
## Median :1.00 Median :31.00 Median :22.00
## Mean :1.14 Mean :34.44 Mean :26.79
## 3rd Qu.:1.00 3rd Qu.:61.00 3rd Qu.:23.00
## Max. :3.00 Max. :96.00 Max. :96.00
# Classify the drinking water source (V113) and the type of toilet facility
# (V116) as "Improved" or "Unimproved": codes in the listed sets are
# "Improved", every other code is "Unimproved".
# NOTE(review): the improved code sets are taken as given -- verify them
# against the survey codebook before reporting results.
data_cleaned <- data_cleaned %>%
  mutate(
    V113_recode = if_else(
      V113 %in% c(11, 12, 13, 21, 31, 41),
      "Improved", "Unimproved"
    ),
    V116_recode = if_else(
      V116 %in% c(11, 12, 13, 14, 15, 21, 41),
      "Improved", "Unimproved"
    )
  )
# Verify the drinking water classification
table(data_cleaned$V113_recode)
##
## Improved Unimproved
## 7474 7040
# Verify the toilet facility classification
table(data_cleaned$V116_recode)
##
## Improved Unimproved
## 6249 8265
Remove missing values: if the number of missing values is small, you can remove the rows that contain them using na.omit() or filtering techniques. Impute missing values: for numerical variables, missing values can be imputed with the mean, the median, or another statistical estimate; for categorical variables, the mode or a value predicted from other variables can be used. Flag missing values: create a flag variable that marks missing data so it can be reported or analysed separately.
# Histogram: distribution of household size (V136)
hist(
  data_relevant[["V136"]],
  breaks = 10,
  col = "red",
  main = "Distribution of Number of Household Members",
  xlab = "Number of Household Members",
  ylab = "Frequency"
)
# Bar chart: share of observations in each poverty_status category
# Step 1: turn the category counts into proportions
poverty_status_proportions <- prop.table(table(data_relevant[["poverty_status"]]))
# Step 2: draw one bar per category
barplot(
  poverty_status_proportions,
  names.arg = c("Poor", "Non-Poor"), # Label categories
  col = c("blue", "green"), # Colors for the bars
  main = "Proportion of Households by Poverty Status",
  xlab = "Poverty Status",
  ylab = "Proportion"
)
library(ggplot2)
# Box plot: V201 compared across the two poverty_status groups
boxplot(
  V201 ~ poverty_status,
  data = data_relevant,
  col = c("blue", "cyan"),
  main = "Number of Living Children by Poverty Status",
  xlab = "Poverty Status",
  ylab = "Number of Living Children"
)
# Explanation: Choosing an appropriate visualization makes the insights in the data easy to communicate. For example, histograms show the distribution of a continuous variable (such as V136). For categorical variables (such as poverty_status), bar charts are an excellent way to show proportions or counts. Boxplots are well suited to comparing distributions between groups and spotting outliers (such as V201 by poverty_status).