1:Data Structures in R Software (10 points)
#Load and Clean the Data 1. Import the SDHS dataset into R: # Install the haven package if you haven’t already # Load the haven package
# Load the dataset (update the path to your dataset file)
# Load haven package
library(haven)
# Read the Stata (.dta) survey file into a tibble; adjust the path if needed.
SLDHS <- read_dta(file = "~/SLDHS.dta")
#Drop Missing Values with the SLDHS
# Load dplyr
library(dplyr)
# Keep only the rows in which every analysis variable is observed.
data_cleaned <- SLDHS %>%
  filter(
    if_all(
      all_of(c(
        "V190", "V024", "V025", "V106", "V152", "V151",
        "V136", "V201", "V501", "V113", "V116"
      )),
      ~ !is.na(.x)
    )
  )
# Check dimensions of the cleaned data
dim(data_cleaned)
## [1] 14514 563
# Verify no missing values remain in the analysis variables.
# vapply() is used instead of sapply() so the result is guaranteed to be a
# named integer vector (one NA count per column) whatever the input looks like.
vapply(
  data_cleaned[, c("V190", "V024", "V025", "V106", "V152", "V151",
                   "V136", "V201", "V501", "V113", "V116")],
  function(column) sum(is.na(column)),
  integer(1)
)
## V190 V024 V025 V106 V152 V151 V136 V201 V501 V113 V116
## 0 0 0 0 0 0 0 0 0 0 0
# Select only the relevant variables
# Load the dplyr package
library(dplyr)
# Reduce the cleaned data to the eleven variables used in the analysis.
data_relevant <- data_cleaned %>%
  select(
    all_of(c(
      "V190", "V024", "V025", "V106", "V152", "V151",
      "V136", "V201", "V501", "V113", "V116"
    ))
  )
#1. List all variables in the dataset
# Ensure data_relevant exists before listing variables
# Stop with an explicit message when data_relevant has not been created;
# otherwise display the structure of the dataset.
if (!exists("data_relevant")) {
  stop("The variable 'data_relevant' does not exist. Please create it first.")
}
str(data_relevant)
## tibble [14,514 × 11] (S3: tbl_df/tbl/data.frame)
## $ V190: dbl+lbl [1:14514] 5, 5, 5, 5, 5, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3...
## ..@ label : chr "Wealth index combined"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Lowest" "Second" "Middle" "Fourth" ...
## $ V024: dbl+lbl [1:14514] 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, ...
## ..@ label : chr "Region"
## ..@ format.stata: chr "%2.0f"
## ..@ labels : Named num [1:6] 11 12 13 14 15 16
## .. ..- attr(*, "names")= chr [1:6] "Awdal" " Marodijeh" "Sahil" "Togdheer" ...
## $ V025: dbl+lbl [1:14514] 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## ..@ label : chr "Type of place of residence"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:6] 1 2 3 4 5 6
## .. ..- attr(*, "names")= chr [1:6] "Rural" "Urban" "Nomadic" "Rural IDP" ...
## $ V106: dbl+lbl [1:14514] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "Highest educational level"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:4] 0 1 2 3
## .. ..- attr(*, "names")= chr [1:4] "No Education" "Primary" "Secondary" "Higher"
## $ V152: num [1:14514] 23 23 23 23 23 61 61 23 23 23 ...
## ..- attr(*, "label")= chr "Age of household head"
## ..- attr(*, "format.stata")= chr "%2.0f"
## $ V151: num [1:14514] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "label")= chr "Sex of household head"
## ..- attr(*, "format.stata")= chr "%1.0f"
## $ V136: num [1:14514] 6 6 6 6 6 4 4 6 6 6 ...
## ..- attr(*, "label")= chr "Number of household members (listed)"
## ..- attr(*, "format.stata")= chr "%1.0f"
## $ V201: num [1:14514] 5 5 5 5 5 2 2 4 4 4 ...
## ..- attr(*, "label")= chr "Total children ever born"
## ..- attr(*, "format.stata")= chr "%2.0f"
## $ V501: dbl+lbl [1:14514] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## ..@ label : chr "Current marital status"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:4] 0 1 2 3
## .. ..- attr(*, "names")= chr [1:4] "Never Married" "Married" "Divorced" "Widowed"
## $ V113: num [1:14514] 11 11 11 11 11 13 13 13 13 13 ...
## ..- attr(*, "label")= chr "Source of drinking water"
## ..- attr(*, "format.stata")= chr "%2.0f"
## $ V116: num [1:14514] 23 23 23 23 23 61 61 23 23 23 ...
## ..- attr(*, "label")= chr "Type of toilet facility"
## ..- attr(*, "format.stata")= chr "%2.0f"
Numeric variables: These contain numerical data and can be used in mathematical operations. They represent continuous or discrete quantities (e.g., age, income). Factor variables: These are categorical variables that represent distinct groups or categories. They are stored internally as integers with corresponding labels (e.g., gender, marital status).
# Centre and spread of V136 (number of household members), ignoring NAs.
mean(data_cleaned[["V136"]], na.rm = TRUE)
## [1] 5.407744
median(data_cleaned[["V136"]], na.rm = TRUE)
## [1] 6
sd(data_cleaned[["V136"]], na.rm = TRUE)
## [1] 2.203568
## [1] 2.203568
# Frequency table of highest educational level (V106).
# Each label is bound to its code (0-3, per the value labels on V106) instead
# of being assigned to the table by position. A category with no observations
# then shows a count of 0 rather than shifting the remaining labels onto the
# wrong counts.
education_labels <- c("No Education", "Primary", "Secondary", "Higher")
freq_table <- table(
  factor(data_cleaned$V106, levels = 0:3, labels = education_labels),
  dnn = NULL # no dimension name, so the printed table has no blank header line
)
print(freq_table)
## No Education Primary Secondary Higher
## 12595 1557 278 84
Calculate the proportion of households in each wealth quintile (“V190”):
# Share of observations falling in each wealth quintile code (V190).
proportions_v190 <- prop.table(table(data_cleaned[["V190"]]))
# Show the proportions
print(proportions_v190)
##
## 1 2 3 4 5
## 0.3690919 0.1726609 0.1164393 0.1605347 0.1812733
# Turn the wealth quintile codes 1-5 into a factor with descriptive labels.
data_cleaned[["V190"]] <- factor(
  data_cleaned[["V190"]],
  levels = 1:5,
  labels = c("Lowest", "Second", "Middle", "Fourth", "Highest")
)
# Same proportions as before, now keyed by the quintile labels.
labeled_proportions <- prop.table(table(data_cleaned[["V190"]]))
# Show the labelled proportions
print(labeled_proportions)
##
## Lowest Second Middle Fourth Highest
## 0.3690919 0.1726609 0.1164393 0.1605347 0.1812733
# Correlation between the age of the household head and the number of children.
# V152 is the variable labelled "Age of household head"; V151, used here
# previously, is labelled "Sex of household head", so the earlier coefficient
# did not involve age at all.
# NOTE(review): summary(data_relevant) shows V152 and V116 with identical
# distributions - confirm that V152 really holds ages in the source file.
age_household_head <- data_cleaned$V152
num_living_children <- data_cleaned$V201
# Pearson correlation (the cor() default), using rows where both values exist.
correlation <- cor(age_household_head, num_living_children, use = "complete.obs")
cat("Correlation Coefficient between age of household head and number of living children:", correlation, "\n")
## NOTE: re-run to regenerate this output. The value recorded previously
## (0.005892517) was computed from V151 (sex of household head), not V152.
To calculate the correlation coefficient between the household head's age (V152) and the number of living children (V201) in R, use the cor() function. Pearson's correlation, the default method, measures the strength of the linear relationship. For instance, cor(data_relevant$V152, data_relevant$V201, use = "complete.obs") computes the correlation, yielding a value between -1 and 1 that indicates the relationship's strength and direction.
#1. Create a new variable called poverty_status based on the V190 variable (wealth quintile):
# Step 1: Ensure V190 is correctly converted to numeric
# Check the structure of V190
# Inspect the storage type and attached value labels of V190.
str(data_relevant[["V190"]])
## dbl+lbl [1:14514] 5, 5, 5, 5, 5, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,...
## @ label : chr "Wealth index combined"
## @ format.stata: chr "%1.0f"
## @ labels : Named num [1:5] 1 2 3 4 5
## ..- attr(*, "names")= chr [1:5] "Lowest" "Second" "Middle" "Fourth" ...
# Convert V190 to numeric (if it's a factor or character)
# V190 arrives as a labelled vector; reduce it to its plain numeric codes.
data_relevant[["V190"]] <- as.numeric(as.character(data_relevant[["V190"]]))
# Verify that V190 has valid numeric values
summary(data_relevant[["V190"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.612 4.000 5.000
# Step 2: Drop any rows in which V190 is missing
data_relevant <- data_relevant %>%
  filter(!is.na(V190))
# Steps 3-4: Quintiles 1-3 become "Poor" and quintiles 4-5 become "Non-Poor",
# stored as a factor with "Poor" as the first level.
data_relevant[["poverty_status"]] <- factor(
  ifelse(data_relevant[["V190"]] <= 3, "Poor", "Non-Poor"),
  levels = c("Poor", "Non-Poor")
)
# Step 5: Verify the new variable
table(data_relevant[["poverty_status"]])
##
## Poor Non-Poor
## 9553 4961
summary(data_relevant)
## V190 V024 V025 V106
## Min. :1.000 Min. :11.00 Min. :1.000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:13.00 1st Qu.:1.000 1st Qu.:0.0000
## Median :2.000 Median :14.00 Median :1.000 Median :0.0000
## Mean :2.612 Mean :14.02 Mean :1.473 Mean :0.1629
## 3rd Qu.:4.000 3rd Qu.:15.00 3rd Qu.:2.000 3rd Qu.:0.0000
## Max. :5.000 Max. :16.00 Max. :2.000 Max. :3.0000
## V152 V151 V136 V201
## Min. :11.00 Min. :1.000 Min. :1.000 Min. : 0.000
## 1st Qu.:13.00 1st Qu.:1.000 1st Qu.:4.000 1st Qu.: 4.000
## Median :22.00 Median :1.000 Median :6.000 Median : 6.000
## Mean :26.79 Mean :1.397 Mean :5.408 Mean : 6.253
## 3rd Qu.:23.00 3rd Qu.:2.000 3rd Qu.:7.000 3rd Qu.: 8.000
## Max. :96.00 Max. :2.000 Max. :9.000 Max. :19.000
## V501 V113 V116 poverty_status
## Min. :1.00 Min. :11.00 Min. :11.00 Poor :9553
## 1st Qu.:1.00 1st Qu.:12.00 1st Qu.:13.00 Non-Poor:4961
## Median :1.00 Median :31.00 Median :22.00
## Mean :1.14 Mean :34.44 Mean :26.79
## 3rd Qu.:1.00 3rd Qu.:61.00 3rd Qu.:23.00
## Max. :3.00 Max. :96.00 Max. :96.00
# Classify source of drinking water (V113): the listed codes are treated as
# "Improved" and every other code as "Unimproved".
data_cleaned <- data_cleaned %>%
  mutate(
    V113_recode = if_else(
      V113 %in% c(11, 12, 13, 21, 31, 41),
      "Improved",
      "Unimproved"
    )
  )
# Check the resulting counts
table(data_cleaned[["V113_recode"]])
##
## Improved Unimproved
## 7474 7040
# Classify type of toilet facility (V116): the listed codes are treated as
# "Improved" and every other code as "Unimproved".
data_cleaned <- data_cleaned %>%
  mutate(
    V116_recode = if_else(
      V116 %in% c(11, 12, 13, 14, 15, 21, 41),
      "Improved",
      "Unimproved"
    )
  )
# Check the resulting counts
table(data_cleaned[["V116_recode"]])
##
## Improved Unimproved
## 6249 8265
Remove Missing Values: If the number of missing values is small, you can remove the rows containing missing data using na.omit() or filtering techniques. Impute Missing Values: For numeric variables, missing values can be replaced with the mean, median, or other statistical estimates. For categorical variables, the mode or a predicted value based on other variables can be used. Flag Missing Values: Create a flag variable to indicate missing data for further analysis or reporting.
# Histogram of V136 (number of household members); breaks = 10 is the
# suggested number of bins.
hist(
  data_relevant[["V136"]],
  breaks = 10,
  col = "green",
  main = "Distribution of Number of Household Members",
  xlab = "Number of Household Members",
  ylab = "Frequency"
)
2. Create a bar chart to visualize the proportion of households in each
poverty status category ("poverty_status")
# Bar chart of the share of observations in each poverty_status category
# Step 1: Convert the category counts into proportions
poverty_status_proportions <- prop.table(
  table(data_relevant[["poverty_status"]])
)
# Step 2: Draw one bar per category
barplot(
  height = poverty_status_proportions,
  names.arg = c("Poor", "Non-Poor"), # Label categories
  col = c("cyan", "black"), # Colors for the bars
  main = "Proportion of Households by Poverty Status",
  xlab = "Poverty Status",
  ylab = "Proportion"
)
library(ggplot2)
# Box plot of V201 split by poverty_status, one box per category.
boxplot(
  V201 ~ poverty_status,
  data = data_relevant,
  col = c("yellow", "blue"),
  main = "Number of Living Children by Poverty Status",
  xlab = "Poverty Status",
  ylab = "Number of Living Children"
)
4. Briefly Explain the importance of Choosing appropriate visualization
techniques for different types of data.
#Explanation: Appropriate visualization techniques help convey insights from the data effectively. For example: Histograms are suitable for understanding the distribution of continuous data (e.g., V136). Bar charts effectively display proportions or counts for categorical variables (e.g., poverty_status). Boxplots are ideal for comparing distributions across groups and identifying outliers (e.g., V201 by poverty_status).