HMW1: Importing data from statistical packages and from a database management system

1. A Statistical package: Stata

#Load necessary package

library(haven)

# Read the Stata (.dta) file into a data frame using haven
CLARE_Quant_final_clean_1_ <- haven::read_dta(
  file = "CLARE_Quant_final_clean (1).dta"
)

2. A spreadsheet export: CSV file (e.g. saved from Excel)

#Necessary package

library(readr)

# Read the CO2 emissions CSV file into a tibble.
# The file contains two columns headed `2019`; read_csv() renames them to
# `2019...34` and `2019...35`, as reported in the message printed below.
CO2_emission <- readr::read_csv(file = "CO2_emission.csv")
## New names:
## Rows: 215 Columns: 35
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): Country Name, country_code, Region, Indicator Name dbl (31): 1990, 1991,
## 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `2019` -> `2019...34`
## • `2019` -> `2019...35`

3. A Database management system: MySQL

#Necessary packages

library(DBI)
library(RMySQL)

# Connect to MySQL database.
# The password is taken from the MYSQL_PASSWORD environment variable (define it
# in ~/.Renviron, for example) so that no credential is stored in this file.

con <- dbConnect(RMySQL::MySQL(),
                 dbname = "secondary_data",
                 host = "localhost",
                 port = 3306,
                 user = "root",
                 password = Sys.getenv("MYSQL_PASSWORD"))

# Read a table from MySQL into R.
# dbReadTable() quotes the table name itself, so the name is given without
# backticks even though it contains a space.

data_mysql <- dbReadTable(con, "financial sector")

# Release the connection now that the table is held in memory
invisible(dbDisconnect(con))

HMW2: Merging datasets

#Importing datasets for merging

library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ purrr     1.2.2
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the labelled Excel sheet and keep only the merge key and column c02
t_labeled <- read_excel("t_labeled.xlsx") %>%
  select(uniqueid, c02)

# Read the Tunga field survey from its Stata file
TUNGA_Field_Survey_V1 <- read_dta("TUNGA Field Survey V1.dta")

# Merge the survey with t_labeled on uniqueid; all = TRUE keeps the rows of
# both datasets, including those without a match in the other one
Tunga_merged <- merge(
  x = TUNGA_Field_Survey_V1,
  y = t_labeled,
  by = "uniqueid",
  all = TRUE
)

##HMW3: using select(), filter(), arrange(), mutate(), group_by(), %>%

# Build the working subset in one pipeline:
#   select()  keeps only district and gender (a02a)
#   filter()  keeps the rows whose district is Kicukiro
#   arrange() sorts the rows by gender, then by district
#   rename()  gives a02a the clearer name sex
Tunga_merged <- Tunga_merged %>%
  select(district, a02a) %>%
  filter(district == "Kicukiro") %>%
  arrange(a02a, district) %>%
  rename(sex = a02a)

# Count the number of rows for each sex
Tunga_merged %>%
  group_by(sex) %>%
  summarise(District_count = n())

HMW4: Using trace() and recover() in R

trace()

The trace() function allows you to insert monitoring code inside an existing function. By doing this, you can observe how the function executes step by step, which makes it easier to understand its internal workflow and debug issues.

# Small helper used in the trace() demonstration: returns x + y
my_add <- function(x, y) {
  x + y
}

# Attach a tracer that prints both arguments at step 1 of my_add()
trace(
  what = my_add,
  tracer = quote(cat("Tracing my_add(): x =", x, " y =", y, "\n")),
  at = 1
)
## [1] "my_add"
# Run the traced function
my_add(5, 10)
## Tracing my_add(5, 10) step 1 
## Tracing my_add(): x = 5  y = 10
## [1] 15
# Detach the tracer so my_add() behaves normally again
untrace(my_add)

recover()

Setting options(error = recover) activates a special debugging mode whenever an uncaught error occurs. Instead of stopping the program outright, it opens a menu that lets you step into the environment where the problem happened. From there, you can inspect the variables and state of the function to better understand what went wrong.

# Enable recover mode, keeping the previous error handler so it can be put
# back once the demonstration is finished
old_opts <- options(error = recover)

# Define a function that always fails; its argument x is accepted but not used
bad_fun <- function(x) {
  stop("This is a forced error for demonstration")
}

# Use tryCatch so the document continues knitting.
# The error is handled here, so only the message below is printed; the
# recover menu is reserved for errors that nothing catches.
tryCatch(
  bad_fun(10),
  error = function(e) {
    cat("An error occurred:", conditionMessage(e), "\n")
  }
)

# Put the previous error handler back so later code is unaffected
options(old_opts)
## An error occurred: This is a forced error for demonstration

##HMW4: Visualization using ggplot

### 1. Bar Plot (geom_bar)

library(ggplot2)

# Bar chart: number of observations for each iris species
ggplot(data = iris, mapping = aes(x = Species)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Count of Iris Species")

# Line chart: pressure against temperature (pressure dataset)
ggplot(data = pressure, mapping = aes(x = temperature, y = pressure)) +
  geom_line(colour = "darkred") +
  labs(title = "Pressure vs Temperature")

# Scatter plot: sepal length against petal length, coloured by species
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Petal.Length, colour = Species)
) +
  geom_point(size = 2) +
  labs(title = "Sepal vs Petal Length")

# Histogram: distribution of sepal length in bins of width 0.5
ggplot(data = iris, mapping = aes(x = Sepal.Length)) +
  geom_histogram(binwidth = 0.5, fill = "forestgreen", colour = "black") +
  labs(title = "Distribution of Sepal Length")

##HMW5: Function to find summary statistics

The quick_stats function calculates basic summary statistics for a numeric dataset.
It gives the minimum, maximum, mean, median, and the first, second, and third quartiles.
This makes it easy to understand the main features of the data in a clear way.

## HMW5: Function to find summary statistics

#' Summary statistics of a numeric vector
#'
#' Missing values are removed before anything is computed. Quartiles are
#' obtained with stats::quantile() (its default interpolation), so they are
#' defined for any number of values, including one or two.
#'
#' @param x A numeric vector; may contain NA.
#' @return A named list with elements min, q1, median, q3, max and mean.
#'   Every element is NA when x has no non-missing values.
quick_stats <- function(x) {
  x <- x[!is.na(x)]

  # Nothing left to summarise: return NA for every statistic
  if (length(x) == 0) {
    return(list(
      min = NA_real_,
      q1 = NA_real_,
      median = NA_real_,
      q3 = NA_real_,
      max = NA_real_,
      mean = NA_real_
    ))
  }

  # names = FALSE drops the "25%"/"50%"/"75%" labels from the result
  quartiles <- stats::quantile(x, probs = c(0.25, 0.5, 0.75), names = FALSE)

  list(
    min = min(x),
    q1 = quartiles[1],
    median = quartiles[2],
    q3 = quartiles[3],
    max = max(x),
    mean = mean(x)
  )
}

# Example
data <- c(5, 8, 10, 12, NA, 15)
quick_stats(data)
## $min
## [1] 5
## 
## $q1
## [1] 8
## 
## $median
## [1] 10
## 
## $q3
## [1] 12
## 
## $max
## [1] 15
## 
## $mean
## [1] 10

HMW6: Apply Functions in R

sapply

The sapply() function is handy for applying a calculation across multiple elements, such as all columns in a dataset. It simplifies the results into a vector or matrix whenever possible, making it easy to quickly compare statistics across variables.

data("iris")

# Standard deviation of each of the four numeric iris columns
sapply(X = iris[, 1:4], FUN = sd)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##    0.8280661    0.4358663    1.7652982    0.7622377

vapply

The vapply() function works like sapply(), but it requires you to declare the expected type of the output in advance. This makes it safer and more predictable, since R will check that the results match the type you specified.

# Largest value in each numeric iris column; every result must be one number
vapply(X = iris[, 1:4], FUN = max, FUN.VALUE = numeric(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##          7.9          4.4          6.9          2.5

lapply

The lapply() function applies a given operation to each element of a list or vector and always returns the results as a list. This makes it especially useful when you want to perform more complex calculations and keep the outputs organized in list form.

# Smallest value in each numeric iris column, returned as a list
lapply(X = iris[, 1:4], FUN = min)
## $Sepal.Length
## [1] 4.3
## 
## $Sepal.Width
## [1] 2
## 
## $Petal.Length
## [1] 1
## 
## $Petal.Width
## [1] 0.1

mapply

The mapply() function extends the idea of sapply() to multiple inputs. It applies a function element‑by‑element across several vectors at the same time, returning the combined results. This is useful when you want to perform operations that depend on more than one sequence of values.

# Element-wise powers: each value of vec1 is raised to the value of vec2 at
# the same position
vec1 <- c(2, 3, 4)
vec2 <- c(3, 2, 1)

mapply(FUN = function(x, y) x ^ y, vec1, vec2)
## [1] 8 9 4

tapply

The tapply() function is used to apply a calculation to subsets of a vector, where the subsets are defined by a factor (a grouping variable). It’s a convenient way to compute statistics separately for each group in your data.

# Largest Petal.Width within each Species group
tapply(X = iris$Petal.Width, INDEX = iris$Species, FUN = max)
##     setosa versicolor  virginica 
##        0.6        1.8        2.5