#Load necessary package
library(haven)
#Importing a data set
household_eicv<-read_dta("C:/Users/PatrickNdacyayisenga/Downloads/Microdata (1)/Cross_Section/CS_S01_S5_S7_Household.dta")
#Necessary package
library(readr)
# Import CSV file
expenditure_eicv<-read_csv("C:/Users/PatrickNdacyayisenga/Downloads/Microdata (1)/Cross_Section/expenditure.csv")
## Rows: 1159158 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): province, district, ur, s8a1q0, s8a1q2, s8a1q4, s8a1q5, epov_jan, ...
## dbl (6): hhid, clust, s8a1q3, s8a1q6, strata_id, weight
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Necessary packages
library(DBI)
library(RMySQL)
# Connect to MySQL database
con <- dbConnect(RMySQL::MySQL(),
dbname = "secondary_data",
host = "localhost",
port = 3306,
user = "root",
password = "Musa#ga@7")
#Read a table from MySQL into R
data_mysql <- dbReadTable(con, "hospitals")
#Importing poverty file(EICV7) to be merged with Household file(EICV7)
poverty_eicv <- read_dta("C:/Users/PatrickNdacyayisenga/Downloads/Microdata (1)/Cross_Section/CS_EICV7_poverty_file.dta")
# exploring both datasets to see the variable they have in common
variable.names(household_eicv)
## [1] "hhid" "clust" "province" "district" "ur" "s5aq1"
## [7] "s5aq2" "s5aq3" "s5aq4y" "s5aq4m" "s5aq5" "s5aq6"
## [13] "s5aq7" "s5aq8" "s5bq1" "s5bq2a" "s5bq2b" "s5bq3a"
## [19] "s5bq3b" "s5bq4" "s5bq5a" "s5bq5b" "s5bq6" "s5bq7"
## [25] "s5bq8" "s5bq9" "s5cq1" "s5cq2a" "s5cq2b" "s5cq3"
## [31] "s5cq4b" "s5cq5" "s5cq6a" "s5cq6b" "s5cq7" "s5cq8"
## [37] "s5cq9a" "s5cq9b" "s5cq10" "s5cq11" "s5cq12" "s5cq13"
## [43] "s5cq14" "s5cq15" "s5cq16" "s5cq17" "s5cq18" "s5cq19"
## [49] "s5cq20" "s5cq20a" "s5cq20b" "s5cq20c" "s5cq21" "s5cq22a"
## [55] "s5cq22b" "s5cq22c" "s5cq23" "s5cq24" "s5cq25" "s5cq26"
## [61] "s5cq27" "s5cq28" "s5cq29" "s5cq30" "s5cq31" "s5cq32"
## [67] "s5cq33" "s5cq34" "s5dq1" "s5dq2" "s5dq3" "s5dq4"
## [73] "s5eq1" "s5eq2a" "s5eq2b" "s5eq2c" "s5eq2aa_m" "s5eq2ab_m"
## [79] "s5eq2ac_m" "s5eq2aa_y" "s5eq2ab_y" "s5eq2ac_y" "s5eq3a" "s5eq3b"
## [85] "s5eq3c" "s5eq4a" "s5eq4b" "s5eq4c" "s7aq4" "s7aq4a"
## [91] "s7aq4b" "s7aq4c" "s7aq4d" "s7aq4e" "s7aq4f" "s7a2q1"
## [97] "s7a2q2" "s7a2q3" "s7a2q4" "s7a2q5a" "s7a2q6a" "s7a2q6b"
## [103] "s7a2q6c" "s7a2q7" "quintile" "s5cq4a" "strata_id" "weight"
## [109] "pop_wt" "epov_jan" "pov_jan" "poverty"
variable.names(poverty_eicv)
## [1] "hhid" "clust" "province" "district"
## [5] "ur" "ae" "member" "Food_at_school"
## [9] "exp1" "exp2" "exp3" "exp4"
## [13] "exp5" "exp6" "exp7" "exp8"
## [17] "exp9" "exp10" "exp11" "hh_index"
## [21] "weight" "pop_wt" "sol_jan" "quintile"
## [25] "food" "cons1" "cons1ae" "pov_jan"
## [29] "poverty" "epov_jan" "Poverty_line" "Extreme_line"
# merging both datasets: Both datasets have the "hhid" variable in common
merged_data <- merge(household_eicv, poverty_eicv, by = "hhid", all = TRUE)
##HMW3: using select(), filter(), arrange(), mutate(), group_by(), %>%
# selecting location, province and consumption variables needed from EICV7 merged dataset
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
new_df<-merged_data %>%
select(ur.x,province.x, cons1)
# filter for consumption greater than 300,000RWF
new_df<-new_df %>%
filter(cons1>300000)
# arrange consumption by descending order
new_df<-new_df %>%
arrange(desc(cons1))
# Renaming ur.x and province.x variables
new_df<-new_df %>%
rename(location=ur.x, province=province.x)
# finding average consumption by location using group_by()
new_df %>%
mutate(location = as_factor(location)) %>%
group_by(location) %>%
summarise(Average=mean(cons1), na.rm=TRUE)
## # A tibble: 2 Ă— 3
## location Average na.rm
## <fct> <dbl> <lgl>
## 1 Urban 4845601. TRUE
## 2 Rural 1884343. TRUE
# Load ggplot2
library(ggplot2)
# Scatter plot of Sepal.Length vs Sepal.Width, colored by Species
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
geom_point(size = 3, alpha = 0.7) +
labs(
title = "Sepal Dimensions by Species",
x = "Sepal Length",
y = "Sepal Width"
) +
theme_minimal()
# Boxplot of Petal.Length by Species
ggplot(iris, aes(x = Species, y = Petal.Length, fill = Species)) +
geom_boxplot() +
labs(
title = "Distribution of Petal Length by Species",
x = "Species",
y = "Petal Length"
) +
theme_classic()
# Line plot (mean Petal.Length by Species)
iris_means <- iris %>%
group_by(Species) %>%
summarise(mean_petal = mean(Petal.Length))
ggplot(iris_means, aes(x = Species, y = mean_petal, group = 1)) +
geom_line(color = "blue", size = 1.2) +
geom_point(color = "red", size = 3) +
labs(
title = "Mean Petal Length by Species",
x = "Species",
y = "Mean Petal Length"
) +
theme_light()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Bar plot (average Sepal.Length by Species)
ggplot(iris, aes(x = Species, y = Sepal.Length, fill = Species)) +
geom_bar(stat = "summary", fun = "mean") +
labs(
title = "Average Sepal Length by Species",
x = "Species",
y = "Average Sepal Length"
) +
theme_classic()
trace() lets you insert debugging code into a function
so you can see what happens when it runs. It is useful for checking the
flow of execution inside existing functions.
# Example: trace the sum function
trace(sum, quote(cat("Input values:", sys.call()[[2]], "\n")), at = 1)
## Tracing function "sum" in package "base"
## [1] "sum"
# When we call sum, it will print the traced message first
sum(c(1, 2, 3, 4))
## Tracing sum(c(1, 2, 3, 4)) step 1
## Input values: expr
## [1] 10
# Remove tracing afterwards
untrace(sum)
## Untracing function "sum" in package "base"
###recover() recover() is used when an error happens. It
lets you enter the environment where the error occurred and inspect
variables.
### recover()
# Turn on recover mode
options(error = recover)
bad_fun <- function(x) {
y <- x + "a" # invalid operation
return(y)
}
# Catch the error so knitting doesn’t halt
try(bad_fun(10))
## Error in x + "a" : non-numeric argument to binary operator
##HMW6: FUnction to find summary statistics
The my_summary function calculates basic summary
statistics for a numeric dataset.
It gives the minimum, maximum, mean, median, and the first, second, and
third quartiles.
This makes it easy to understand the main features of the data in a
clear way.
my_summary <- function(x) {
x <- x[!is.na(x)]
sorted_x <- sort(x)
n <- length(sorted_x)
mean_val <- sum(sorted_x) / n
if (n %% 2 == 1) {
median_val <- sorted_x[(n + 1) / 2]
} else {
median_val <- (sorted_x[n/2] + sorted_x[n/2 + 1]) / 2
}
q1 <- sorted_x[round(n * 0.25)]
q2 <- median_val
q3 <- sorted_x[round(n * 0.75)]
stats <- list(
min = sorted_x[1],
q1 = q1,
median = q2,
q3 = q3,
max = sorted_x[n],
mean = mean_val
)
return(stats)
}
# Example
data <- c(5, 8, 10, 12, NA, 15)
my_summary(data)
## $min
## [1] 5
##
## $q1
## [1] 5
##
## $median
## [1] 10
##
## $q3
## [1] 12
##
## $max
## [1] 15
##
## $mean
## [1] 10
sapplycan quickly compute statistics across columns.
data("mtcars")
# Mean of each column in mtcars
sapply(mtcars, mean)
## mpg cyl disp hp drat wt qsec
## 20.090625 6.187500 230.721875 146.687500 3.596563 3.217250 17.848750
## vs am gear carb
## 0.437500 0.406250 3.687500 2.812500
vapply is similar to sapply, but you specify the output
type.
# Mean of each column in mtcars, expecting numeric output
vapply(mtcars, mean, numeric(1))
## mpg cyl disp hp drat wt qsec
## 20.090625 6.187500 230.721875 146.687500 3.596563 3.217250 17.848750
## vs am gear carb
## 0.437500 0.406250 3.687500 2.812500
lapplyalways returns a list, useful for more complex
operations.
# Standard deviation of each column in iris (numeric only)
lapply(iris[,1:4], sd)
## $Sepal.Length
## [1] 0.8280661
##
## $Sepal.Width
## [1] 0.4358663
##
## $Petal.Length
## [1] 1.765298
##
## $Petal.Width
## [1] 0.7622377
#mapply mapply applies a function to multiple vectors at
once.
# Add Sepal.Length and Sepal.Width element-wise
mapply(sum, iris$Sepal.Length, iris$Sepal.Width)
## [1] 8.6 7.9 7.9 7.7 8.6 9.3 8.0 8.4 7.3 8.0 9.1 8.2 7.8 7.3 9.8
## [16] 10.1 9.3 8.6 9.5 8.9 8.8 8.8 8.2 8.4 8.2 8.0 8.4 8.7 8.6 7.9
## [31] 7.9 8.8 9.3 9.7 8.0 8.2 9.0 8.5 7.4 8.5 8.5 6.8 7.6 8.5 8.9
## [46] 7.8 8.9 7.8 9.0 8.3 10.2 9.6 10.0 7.8 9.3 8.5 9.6 7.3 9.5 7.9
## [61] 7.0 8.9 8.2 9.0 8.5 9.8 8.6 8.5 8.4 8.1 9.1 8.9 8.8 8.9 9.3
## [76] 9.6 9.6 9.7 8.9 8.3 7.9 7.9 8.5 8.7 8.4 9.4 9.8 8.6 8.6 8.0
## [91] 8.1 9.1 8.4 7.3 8.3 8.7 8.6 9.1 7.6 8.5 9.6 8.5 10.1 9.2 9.5
## [106] 10.6 7.4 10.2 9.2 10.8 9.7 9.1 9.8 8.2 8.6 9.6 9.5 11.5 10.3 8.2
## [121] 10.1 8.4 10.5 9.0 10.0 10.4 9.0 9.1 9.2 10.2 10.2 11.7 9.2 9.1 8.7
## [136] 10.7 9.7 9.5 9.0 10.0 9.8 10.0 8.5 10.0 10.0 9.7 8.8 9.5 9.6 8.9
tapply applies a function to subsets of data defined by
a factor.
# Mean Sepal.Length grouped by Species
tapply(iris$Sepal.Length, iris$Species, mean)
## setosa versicolor virginica
## 5.006 5.936 6.588
# A function to perform a two-sample t-test
my_ttest <- function(x, y, equal_var = TRUE) {
# Remove missing values
x <- x[!is.na(x)]
y <- y[!is.na(y)]
# Perform t-test
result <- t.test(x, y, var.equal = equal_var)
# Return key results
output <- list(
mean_x = mean(x),
mean_y = mean(y),
t_statistic = result$statistic,
df = result$parameter,
p_value = result$p.value,
conf_int = result$conf.int,
conclusion = ifelse(result$p.value < 0.05,
"Reject null hypothesis: means differ",
"Fail to reject null hypothesis: no evidence of difference")
)
return(output)
}
# Example using iris dataset: compare Sepal.Length between two species
setosa <- iris$Sepal.Length[iris$Species == "setosa"]
versicolor <- iris$Sepal.Length[iris$Species == "versicolor"]
my_ttest(setosa, versicolor)
## $mean_x
## [1] 5.006
##
## $mean_y
## [1] 5.936
##
## $t_statistic
## t
## -10.52099
##
## $df
## df
## 98
##
## $p_value
## [1] 8.985235e-18
##
## $conf_int
## [1] -1.1054165 -0.7545835
## attr(,"conf.level")
## [1] 0.95
##
## $conclusion
## [1] "Reject null hypothesis: means differ"