HW1: Import data from Statistical Package and from Database Management

1. A Statistical package: Stata

#Load necessary package

library(haven)

#Importing a data set

household_eicv<-read_dta("C:/Users/PatrickNdacyayisenga/Downloads/Microdata (1)/Cross_Section/CS_S01_S5_S7_Household.dta")

2. A Statistical package: Excel

#Necessary package

library(readr)

# Import CSV file 

expenditure_eicv<-read_csv("C:/Users/PatrickNdacyayisenga/Downloads/Microdata (1)/Cross_Section/expenditure.csv")
## Rows: 1159158 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): province, district, ur, s8a1q0, s8a1q2, s8a1q4, s8a1q5, epov_jan, ...
## dbl  (6): hhid, clust, s8a1q3, s8a1q6, strata_id, weight
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

3. A Database management system: MySQL

#Necessary packages

library(DBI)
library(RMySQL)

# Connect to MySQL database

con <- dbConnect(RMySQL::MySQL(),
                 dbname = "secondary_data",
                 host = "localhost",
                 port = 3306,
                 user = "root",
                 password = "Musa#ga@7")

#Read a table from MySQL into R

data_mysql <- dbReadTable(con, "hospitals")

HMW2: Merging datasets

#Importing poverty file(EICV7) to be merged with Household file(EICV7)

poverty_eicv <- read_dta("C:/Users/PatrickNdacyayisenga/Downloads/Microdata (1)/Cross_Section/CS_EICV7_poverty_file.dta")

# exploring both datasets to see the variable they have in common

variable.names(household_eicv)
##   [1] "hhid"      "clust"     "province"  "district"  "ur"        "s5aq1"    
##   [7] "s5aq2"     "s5aq3"     "s5aq4y"    "s5aq4m"    "s5aq5"     "s5aq6"    
##  [13] "s5aq7"     "s5aq8"     "s5bq1"     "s5bq2a"    "s5bq2b"    "s5bq3a"   
##  [19] "s5bq3b"    "s5bq4"     "s5bq5a"    "s5bq5b"    "s5bq6"     "s5bq7"    
##  [25] "s5bq8"     "s5bq9"     "s5cq1"     "s5cq2a"    "s5cq2b"    "s5cq3"    
##  [31] "s5cq4b"    "s5cq5"     "s5cq6a"    "s5cq6b"    "s5cq7"     "s5cq8"    
##  [37] "s5cq9a"    "s5cq9b"    "s5cq10"    "s5cq11"    "s5cq12"    "s5cq13"   
##  [43] "s5cq14"    "s5cq15"    "s5cq16"    "s5cq17"    "s5cq18"    "s5cq19"   
##  [49] "s5cq20"    "s5cq20a"   "s5cq20b"   "s5cq20c"   "s5cq21"    "s5cq22a"  
##  [55] "s5cq22b"   "s5cq22c"   "s5cq23"    "s5cq24"    "s5cq25"    "s5cq26"   
##  [61] "s5cq27"    "s5cq28"    "s5cq29"    "s5cq30"    "s5cq31"    "s5cq32"   
##  [67] "s5cq33"    "s5cq34"    "s5dq1"     "s5dq2"     "s5dq3"     "s5dq4"    
##  [73] "s5eq1"     "s5eq2a"    "s5eq2b"    "s5eq2c"    "s5eq2aa_m" "s5eq2ab_m"
##  [79] "s5eq2ac_m" "s5eq2aa_y" "s5eq2ab_y" "s5eq2ac_y" "s5eq3a"    "s5eq3b"   
##  [85] "s5eq3c"    "s5eq4a"    "s5eq4b"    "s5eq4c"    "s7aq4"     "s7aq4a"   
##  [91] "s7aq4b"    "s7aq4c"    "s7aq4d"    "s7aq4e"    "s7aq4f"    "s7a2q1"   
##  [97] "s7a2q2"    "s7a2q3"    "s7a2q4"    "s7a2q5a"   "s7a2q6a"   "s7a2q6b"  
## [103] "s7a2q6c"   "s7a2q7"    "quintile"  "s5cq4a"    "strata_id" "weight"   
## [109] "pop_wt"    "epov_jan"  "pov_jan"   "poverty"
variable.names(poverty_eicv) 
##  [1] "hhid"           "clust"          "province"       "district"      
##  [5] "ur"             "ae"             "member"         "Food_at_school"
##  [9] "exp1"           "exp2"           "exp3"           "exp4"          
## [13] "exp5"           "exp6"           "exp7"           "exp8"          
## [17] "exp9"           "exp10"          "exp11"          "hh_index"      
## [21] "weight"         "pop_wt"         "sol_jan"        "quintile"      
## [25] "food"           "cons1"          "cons1ae"        "pov_jan"       
## [29] "poverty"        "epov_jan"       "Poverty_line"   "Extreme_line"
# merging both datasets: Both datasets have the "hhid" variable in common

merged_data <- merge(household_eicv, poverty_eicv, by = "hhid", all = TRUE)

##HMW3: using select(), filter(), arrange(), mutate(), group_by(), %>%

# selecting location, province and consumption variables  needed from EICV7 merged dataset

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
new_df<-merged_data %>% 
  select(ur.x,province.x, cons1)

# filter for consumption greater than 300,000RWF

new_df<-new_df %>% 
  filter(cons1>300000)

# arrange consumption by descending order

new_df<-new_df %>% 
  arrange(desc(cons1))

# Renaming ur.x and province.x variables

new_df<-new_df %>% 
  rename(location=ur.x, province=province.x)

# finding average consumption by location using group_by()

new_df %>% 
  mutate(location = as_factor(location)) %>% 
  group_by(location) %>% 
  summarise(Average=mean(cons1), na.rm=TRUE)
## # A tibble: 2 Ă— 3
##   location  Average na.rm
##   <fct>       <dbl> <lgl>
## 1 Urban    4845601. TRUE 
## 2 Rural    1884343. TRUE

## HMW4: Visualization with ggplot2

# Load ggplot2
library(ggplot2)

# Scatter plot of Sepal.Length vs Sepal.Width, colored by Species
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    title = "Sepal Dimensions by Species",
    x = "Sepal Length",
    y = "Sepal Width"
  ) +
  theme_minimal()

# Boxplot of Petal.Length by Species
ggplot(iris, aes(x = Species, y = Petal.Length, fill = Species)) +
  geom_boxplot() +
  labs(
    title = "Distribution of Petal Length by Species",
    x = "Species",
    y = "Petal Length"
  ) +
  theme_classic()

# Line plot (mean Petal.Length by Species)
iris_means <- iris %>%
  group_by(Species) %>%
  summarise(mean_petal = mean(Petal.Length))

ggplot(iris_means, aes(x = Species, y = mean_petal, group = 1)) +
  geom_line(color = "blue", size = 1.2) +
  geom_point(color = "red", size = 3) +
  labs(
    title = "Mean Petal Length by Species",
    x = "Species",
    y = "Mean Petal Length"
  ) +
  theme_light()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Bar plot (average Sepal.Length by Species)
ggplot(iris, aes(x = Species, y = Sepal.Length, fill = Species)) +
  geom_bar(stat = "summary", fun = "mean") +
  labs(
    title = "Average Sepal Length by Species",
    x = "Species",
    y = "Average Sepal Length"
  ) +
  theme_classic()

HMW5: Using trace() and recover() in R

trace()

trace() lets you insert debugging code into a function so you can see what happens when it runs. It is useful for checking the flow of execution inside existing functions.

# Example: trace the sum function
trace(sum, quote(cat("Input values:", sys.call()[[2]], "\n")), at = 1)
## Tracing function "sum" in package "base"
## [1] "sum"
# When we call sum, it will print the traced message first
sum(c(1, 2, 3, 4))
## Tracing sum(c(1, 2, 3, 4)) step 1 
## Input values: expr
## [1] 10
# Remove tracing afterwards
untrace(sum)
## Untracing function "sum" in package "base"

###recover() recover() is used when an error happens. It lets you enter the environment where the error occurred and inspect variables.

### recover()
# Turn on recover mode

options(error = recover)

bad_fun <- function(x) {
  y <- x + "a"   # invalid operation
  return(y)
}

# Catch the error so knitting doesn’t halt
try(bad_fun(10))
## Error in x + "a" : non-numeric argument to binary operator

##HMW6: FUnction to find summary statistics

The my_summary function calculates basic summary statistics for a numeric dataset.
It gives the minimum, maximum, mean, median, and the first, second, and third quartiles.
This makes it easy to understand the main features of the data in a clear way.

my_summary <- function(x) {
  x <- x[!is.na(x)]
  sorted_x <- sort(x)
  n <- length(sorted_x)
  
  mean_val <- sum(sorted_x) / n
  
  if (n %% 2 == 1) {
    median_val <- sorted_x[(n + 1) / 2]
  } else {
    median_val <- (sorted_x[n/2] + sorted_x[n/2 + 1]) / 2
  }
  
  q1 <- sorted_x[round(n * 0.25)]
  q2 <- median_val
  q3 <- sorted_x[round(n * 0.75)]
  
  stats <- list(
    min = sorted_x[1],
    q1 = q1,
    median = q2,
    q3 = q3,
    max = sorted_x[n],
    mean = mean_val
  )
  
  return(stats)
}

# Example
data <- c(5, 8, 10, 12, NA, 15)
my_summary(data)
## $min
## [1] 5
## 
## $q1
## [1] 5
## 
## $median
## [1] 10
## 
## $q3
## [1] 12
## 
## $max
## [1] 15
## 
## $mean
## [1] 10

HMW7: Apply Functions in R

sapply

sapplycan quickly compute statistics across columns.

data("mtcars")
# Mean of each column in mtcars
sapply(mtcars, mean)
##        mpg        cyl       disp         hp       drat         wt       qsec 
##  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250  17.848750 
##         vs         am       gear       carb 
##   0.437500   0.406250   3.687500   2.812500

vapply

vapply is similar to sapply, but you specify the output type.

# Mean of each column in mtcars, expecting numeric output
vapply(mtcars, mean, numeric(1))
##        mpg        cyl       disp         hp       drat         wt       qsec 
##  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250  17.848750 
##         vs         am       gear       carb 
##   0.437500   0.406250   3.687500   2.812500

lapply

lapplyalways returns a list, useful for more complex operations.

# Standard deviation of each column in iris (numeric only)
lapply(iris[,1:4], sd)
## $Sepal.Length
## [1] 0.8280661
## 
## $Sepal.Width
## [1] 0.4358663
## 
## $Petal.Length
## [1] 1.765298
## 
## $Petal.Width
## [1] 0.7622377

#mapply mapply applies a function to multiple vectors at once.

# Add Sepal.Length and Sepal.Width element-wise
mapply(sum, iris$Sepal.Length, iris$Sepal.Width)
##   [1]  8.6  7.9  7.9  7.7  8.6  9.3  8.0  8.4  7.3  8.0  9.1  8.2  7.8  7.3  9.8
##  [16] 10.1  9.3  8.6  9.5  8.9  8.8  8.8  8.2  8.4  8.2  8.0  8.4  8.7  8.6  7.9
##  [31]  7.9  8.8  9.3  9.7  8.0  8.2  9.0  8.5  7.4  8.5  8.5  6.8  7.6  8.5  8.9
##  [46]  7.8  8.9  7.8  9.0  8.3 10.2  9.6 10.0  7.8  9.3  8.5  9.6  7.3  9.5  7.9
##  [61]  7.0  8.9  8.2  9.0  8.5  9.8  8.6  8.5  8.4  8.1  9.1  8.9  8.8  8.9  9.3
##  [76]  9.6  9.6  9.7  8.9  8.3  7.9  7.9  8.5  8.7  8.4  9.4  9.8  8.6  8.6  8.0
##  [91]  8.1  9.1  8.4  7.3  8.3  8.7  8.6  9.1  7.6  8.5  9.6  8.5 10.1  9.2  9.5
## [106] 10.6  7.4 10.2  9.2 10.8  9.7  9.1  9.8  8.2  8.6  9.6  9.5 11.5 10.3  8.2
## [121] 10.1  8.4 10.5  9.0 10.0 10.4  9.0  9.1  9.2 10.2 10.2 11.7  9.2  9.1  8.7
## [136] 10.7  9.7  9.5  9.0 10.0  9.8 10.0  8.5 10.0 10.0  9.7  8.8  9.5  9.6  8.9

tapply

tapply applies a function to subsets of data defined by a factor.

# Mean Sepal.Length grouped by Species
tapply(iris$Sepal.Length, iris$Species, mean)
##     setosa versicolor  virginica 
##      5.006      5.936      6.588
# A function to perform a two-sample t-test
my_ttest <- function(x, y, equal_var = TRUE) {
  # Remove missing values
  x <- x[!is.na(x)]
  y <- y[!is.na(y)]
  
  # Perform t-test
  result <- t.test(x, y, var.equal = equal_var)
  
  # Return key results
  output <- list(
    mean_x = mean(x),
    mean_y = mean(y),
    t_statistic = result$statistic,
    df = result$parameter,
    p_value = result$p.value,
    conf_int = result$conf.int,
    conclusion = ifelse(result$p.value < 0.05,
                        "Reject null hypothesis: means differ",
                        "Fail to reject null hypothesis: no evidence of difference")
  )
  
  return(output)
}

# Example using iris dataset: compare Sepal.Length between two species
setosa <- iris$Sepal.Length[iris$Species == "setosa"]
versicolor <- iris$Sepal.Length[iris$Species == "versicolor"]

my_ttest(setosa, versicolor)
## $mean_x
## [1] 5.006
## 
## $mean_y
## [1] 5.936
## 
## $t_statistic
##         t 
## -10.52099 
## 
## $df
## df 
## 98 
## 
## $p_value
## [1] 8.985235e-18
## 
## $conf_int
## [1] -1.1054165 -0.7545835
## attr(,"conf.level")
## [1] 0.95
## 
## $conclusion
## [1] "Reject null hypothesis: means differ"