HW1: Import data from statistical package and from database management systems

Import data from Statistical Package: haven package

The haven package in R is designed to import and export foreign statistical data formats from SAS,SPSS, and Stata files into R. #### install packages and libraries

#install.packages("haven")
#install.packages("readxl")
#install.packages("readr")
#install.packages("sas7bdat")
library(haven)
library(readxl)
library(readr)
library(sas7bdat)

Import SPSS (.sav)

#Import SPSS (.sav) file in R using haven library
data_spss <- read_sav("C:/Users/LENOVO/OneDrive/Desktop/spss_data.sav")
head(data_spss)

Import dataframe in R Using readxl for Excel files or csv files

data_excell <- read_excel("C:/Users/LENOVO/OneDrive/Desktop/data_excel.xlsx")
head(data_excell)

Import dataframe in R from csv format

data_csv <- read.csv("C:/Users/LENOVO/OneDrive/Desktop/data3.csv")
head(data_csv)

HW2: How dataframes can be marges by considering two or more common factors?

The merge() function in R is used to combine two datasets using one or more common columns. They are different types of merging dataframe like (merge by default, left merge, right merge, merge all). Sometimes the data frame might have different dimensions or matching columns are named in different ways.

Let me start by merge dateframes having the same dimensions ### Dataframe creation or loading datasets in R

#First dataframe
data_1 <- data.frame(
  ID = c(1, 2, 3, 4, 5),
  Names = c("Alice", "John", "Mary","Simon", "Caleb"),
  Gender = c("F","M","F","M","M"),
  Age = c(22,25,19,30,28),
  Country =c("Rwanda", "Kenya","Nigeria","Rwanda", "Tanzania"),
  Major =c("Statistics","IT","Maths","Busines","Medical")
)
#Second dataframe
data_2<-data.frame(
  ID=c(1,2,3,4,5),
  Names = c("Alice", "John", "Mary","Simon", "Caleb"),
  Gender = c("F","M","F","M","M"),
  Reserch_grade = c(18,15,10,16,13),
  Python_grade = c(14,16,11,18,15),
  Alg_grade =c(16,14,17,12,15),
  math_grade = c(15,12,12,17,16),
  R_grade =c(15,18,14,16,13)
)

Merge dataframes by one common factor

# merge two dataframes by one common factor ("ID)
merged_1<-merge(data_1,data_2,by="ID")
head(merged_1, 3)

Merge dataframes by two common factors

# merge two dataframes by one common factor ("ID)
merged_2<-merge(data_1,data_2,  by = c("ID", "Names"))
head(merged_2,3)

Merge dataframes by three common factors

# merge two dataframes by one common factor ("ID)
merged_3<-merge(data_1,data_2,  by = c("ID", "Names","Gender"))
head(merged_3,3)

Merge dataframes with different dimensions and matching columns are having are named in different ways:

# Loading datasets from kaggle website
# 1. World population dataset
World_population <-read.csv("C:/Users/LENOVO/OneDrive/Desktop/Datasets/world_population.csv")
head(World_population,3)
#2. CO2_Emission dataset
co2_Emission <-read.csv("C:/Users/LENOVO/OneDrive/Desktop/Datasets/co2_Emission.csv")
head(co2_Emission,3)

Display the variables in each dataframe

cat("\n======== variable names in world population dataset=====\n")
## 
## ======== variable names in world population dataset=====
variable.names(World_population)
##  [1] "Rank"                        "CCA3"                       
##  [3] "Country.Territory"           "Capital"                    
##  [5] "Continent"                   "X2022.Population"           
##  [7] "X2020.Population"            "X2015.Population"           
##  [9] "X2010.Population"            "X2000.Population"           
## [11] "X1990.Population"            "X1980.Population"           
## [13] "X1970.Population"            "Area..km.."                 
## [15] "Density..per.km.."           "Growth.Rate"                
## [17] "World.Population.Percentage"
cat("\n======== variable names in co2_Emission dataset=====\n")
## 
## ======== variable names in co2_Emission dataset=====
variable.names(co2_Emission)
##  [1] "Country.Name"   "country_code"   "Region"         "Indicator.Name"
##  [5] "X1990"          "X1991"          "X1992"          "X1993"         
##  [9] "X1994"          "X1995"          "X1996"          "X1997"         
## [13] "X1998"          "X1999"          "X2000"          "X2001"         
## [17] "X2002"          "X2003"          "X2004"          "X2005"         
## [21] "X2006"          "X2007"          "X2008"          "X2009"         
## [25] "X2010"          "X2011"          "X2012"          "X2013"         
## [29] "X2014"          "X2015"          "X2016"          "X2017"         
## [33] "X2018"          "X2019"          "X2019.1"

Dimension for each dataframe

cat("\n === Checking the dimension of world_population dataset===\n")
## 
##  === Checking the dimension of world_population dataset===
dim(World_population)
## [1] 234  17
cat("\n=== Checking the dimension of co2_Emission dataset===\n ")
## 
## === Checking the dimension of co2_Emission dataset===
## 
dim(co2_Emission)
## [1] 215  35

Merge two datasets by considering two factors (country name and country code)

In the two datasets the factor country is written in different ways like Country.Territory in world_population and Country.Name in co2_Emission.

First, let me use merge() function by default that keeps only matching rows

# Using default merge: Keeps only matching rows
# Merge by two common factors
merge_pop_co2 <- merge(
  World_population,
  co2_Emission,
  by.x = c("Country.Territory", "CCA3"),
  by.y = c("Country.Name", "country_code")
)
dim(merge_pop_co2)
## [1] 185  50

Data Cleaning

1. Data cleaning for the merged datasets with merge default that keeps only matching rows and drop rows with NA

# Count the total  missing values in each column for the merged dataset.
colSums(is.na(merge_pop_co2))
##           Country.Territory                        CCA3 
##                           0                           0 
##                        Rank                     Capital 
##                           0                           0 
##                   Continent            X2022.Population 
##                           0                           0 
##            X2020.Population            X2015.Population 
##                           0                           0 
##            X2010.Population            X2000.Population 
##                           0                           0 
##            X1990.Population            X1980.Population 
##                           0                           0 
##            X1970.Population                  Area..km.. 
##                           0                           0 
##           Density..per.km..                 Growth.Rate 
##                           0                           0 
## World.Population.Percentage                      Region 
##                           0                           0 
##              Indicator.Name                       X1990 
##                           0                          22 
##                       X1991                       X1992 
##                          21                          19 
##                       X1993                       X1994 
##                          19                          19 
##                       X1995                       X1996 
##                          18                          18 
##                       X1997                       X1998 
##                          18                          19 
##                       X1999                       X2000 
##                          19                          18 
##                       X2001                       X2002 
##                          18                          17 
##                       X2003                       X2004 
##                          17                          17 
##                       X2005                       X2006 
##                          17                          17 
##                       X2007                       X2008 
##                          17                          17 
##                       X2009                       X2010 
##                          17                          17 
##                       X2011                       X2012 
##                          17                          17 
##                       X2013                       X2014 
##                          17                          17 
##                       X2015                       X2016 
##                          17                          17 
##                       X2017                       X2018 
##                          17                          17 
##                       X2019                     X2019.1 
##                          17                          17

Handling the missing values by replacing with mean column value

HW3: How to deal with group-by and %>% in the package of dplyr

  1. What is %>% (pipe operator)? The pipe sends the result of one step into the next step. Let us use the mtcars dataset available in R as the first example for using %>% (pipe).
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Example 1:
data("mtcars")
# Without piping
mean(mtcars$mpg)
## [1] 20.09062
#With pipe
mtcars %>% 
  summarise(mean_mpg = mean(mpg))

Data Cleaning by replacing the missing values by mean using %>%

This is very common in statistics since it preverses datasets size and reduces bias in the dataset.

# Example 2: Handling missing values by replacing NA by mean.
library(dplyr)
clean_data <- merge_pop_co2 %>%
  mutate(
    across(
      where(is.numeric),
      ~ ifelse(is.na(.),
               mean(., na.rm = TRUE),
               .)
    )
  )
#is.na(clean_data1)
colSums(is.na(clean_data))
##           Country.Territory                        CCA3 
##                           0                           0 
##                        Rank                     Capital 
##                           0                           0 
##                   Continent            X2022.Population 
##                           0                           0 
##            X2020.Population            X2015.Population 
##                           0                           0 
##            X2010.Population            X2000.Population 
##                           0                           0 
##            X1990.Population            X1980.Population 
##                           0                           0 
##            X1970.Population                  Area..km.. 
##                           0                           0 
##           Density..per.km..                 Growth.Rate 
##                           0                           0 
## World.Population.Percentage                      Region 
##                           0                           0 
##              Indicator.Name                       X1990 
##                           0                           0 
##                       X1991                       X1992 
##                           0                           0 
##                       X1993                       X1994 
##                           0                           0 
##                       X1995                       X1996 
##                           0                           0 
##                       X1997                       X1998 
##                           0                           0 
##                       X1999                       X2000 
##                           0                           0 
##                       X2001                       X2002 
##                           0                           0 
##                       X2003                       X2004 
##                           0                           0 
##                       X2005                       X2006 
##                           0                           0 
##                       X2007                       X2008 
##                           0                           0 
##                       X2009                       X2010 
##                           0                           0 
##                       X2011                       X2012 
##                           0                           0 
##                       X2013                       X2014 
##                           0                           0 
##                       X2015                       X2016 
##                           0                           0 
##                       X2017                       X2018 
##                           0                           0 
##                       X2019                     X2019.1 
##                           0                           0

using group_by from dplyr package

What is group_by()? It splits your data into groups so operations are done within each group. Here, we want to calculate the total population by continent

#Example 1: Total population by continent
clean_data %>%
  group_by(Continent) %>%
  summarise(
    total_population = sum(X2022.Population, na.rm = TRUE)
  )

Use of select(), filter(), rename(), arrange(), group_by(), and %>%

library(dplyr)

result <- World_population %>%
  
  # 1. Rename columns (example)
  rename(
    Country = Country.Territory,
    Pop_2022 = X2022.Population
  ) %>%
  
  # 2. Select only useful columns
  select(Country, CCA3, Continent, Pop_2022) %>%
  
  # 3. Filter (example: keep large population countries)
  filter(Pop_2022 > 50000000) %>%
  
  # 4. Create new variable
  mutate(
    Pop_Millions = Pop_2022 / 1000000
  ) %>%
  
  # 5. Group by continent
  group_by(Continent) %>%
  
  # 6. Summarise (important step after group_by)
  summarise(
    avg_population = mean(Pop_Millions, na.rm = TRUE),
    total_population = sum(Pop_2022, na.rm = TRUE),
    countries = n()
  ) %>%
  
  # 7. Arrange results
  arrange(desc(total_population))

# View result
result

HW4: Hoe to use trace() and recover() functions in R? using online learning

  1. trace()
  2. recover() Apply the two functions on real examples

Example 1: Function that performs some operations

divide_num <- function(a, b) {
  results <- (a / b)+5  # R returns Inf here if b is 0
  
  if (is.infinite(results)) {
    return("Error: divide a number by zero is not possible")
  } else {
    return(results)
  }
}
divide_num(3,0)  
## [1] "Error: divide a number by zero is not possible"

Using trace()

The trace() function helps you see what happens inside a function while it is running. We pply trace() to watch execution of the function created.

#trace(what, tracer)
trace(
  divide_num,
  tracer = quote(paste("Function divide_num is running: Please wait!\n"))
)
## [1] "divide_num"
divide_num(10, 2)
## Tracing divide_num(10, 2) on entry
## [1] 10

Add recover() to the function for

divide_num <- function(a,b) {

  if (b == 0) {
    stop("Error: divide a number by zero is not possible")
  }
  results <- (a/b)
  return(results)
}
# Activate recover mode
options(error = recover)

# Run the function
divide_num(3,2)
## [1] 1.5

Combination of trace() and recover() functions

# Define the function
divide_num <- function(a,b) {
  if (b == 0) {
    stop("Error: divide a number by zero is not possible")
  }
  results <- (a/b+2)
  return(results)
}
# Add tracing to the function
trace(
  divide_num,
  tracer = quote(cat("Tracing: a =", a, " b =", b, "\n"))
)
## [1] "divide_num"
# Activate recover mode
options(error = recover)

# Run the function
divide_num(3,0)
## Tracing divide_num(3, 0) on entry 
## Tracing: a = 3  b = 0
## Error in divide_num(3, 0): Error: divide a number by zero is not possible

Example 2: Function that calculate the square root of any real number

# Step 1: Creating a function
sqrt_num <- function(x) {
  if (x < 0) {
    return("Error: square root of a negative number is not possible under real numbers")
  }

  result <- sqrt(x)
  return(result)
}
# Example
print(paste("The square root of 25 is:", sqrt_num(25)))
## [1] "The square root of 25 is: 5"
print(paste("The square root of 49 is:", sqrt_num(49)))
## [1] "The square root of 49 is: 7"
print(paste("The square root of -4 is:", sqrt_num(-4)))
## [1] "The square root of -4 is: Error: square root of a negative number is not possible under real numbers"

HW5: (a) use of sapply(), lappy()

# Handling missing values
# Use of sapply(), lapply() to select numeric columns from dataframe
clean_data <- merge_pop_co2
numeric_cols <- sapply(clean_data, is.numeric)
clean_data[numeric_cols] <- lapply(
  clean_data1[numeric_cols],
  function(x) {
    x[is.na(x)] <- mean(x, na.rm = TRUE)
    return(x)
  }
)
## Error: object 'clean_data1' not found
colSums(is.na(clean_data))
##           Country.Territory                        CCA3 
##                           0                           0 
##                        Rank                     Capital 
##                           0                           0 
##                   Continent            X2022.Population 
##                           0                           0 
##            X2020.Population            X2015.Population 
##                           0                           0 
##            X2010.Population            X2000.Population 
##                           0                           0 
##            X1990.Population            X1980.Population 
##                           0                           0 
##            X1970.Population                  Area..km.. 
##                           0                           0 
##           Density..per.km..                 Growth.Rate 
##                           0                           0 
## World.Population.Percentage                      Region 
##                           0                           0 
##              Indicator.Name                       X1990 
##                           0                          22 
##                       X1991                       X1992 
##                          21                          19 
##                       X1993                       X1994 
##                          19                          19 
##                       X1995                       X1996 
##                          18                          18 
##                       X1997                       X1998 
##                          18                          19 
##                       X1999                       X2000 
##                          19                          18 
##                       X2001                       X2002 
##                          18                          17 
##                       X2003                       X2004 
##                          17                          17 
##                       X2005                       X2006 
##                          17                          17 
##                       X2007                       X2008 
##                          17                          17 
##                       X2009                       X2010 
##                          17                          17 
##                       X2011                       X2012 
##                          17                          17 
##                       X2013                       X2014 
##                          17                          17 
##                       X2015                       X2016 
##                          17                          17 
##                       X2017                       X2018 
##                          17                          17 
##                       X2019                     X2019.1 
##                          17                          17

Detecting outliers by boxplot and use sapply() in methods of handling outliers.

boxplot(World_population[sapply(World_population, is.numeric)],
        main = "Before Outlier Treatment")
## Warning in x[floor(d)] + x[ceiling(d)]: NAs produced by integer overflow
## Warning in x[floor(d)] + x[ceiling(d)]: NAs produced by integer overflow
## Warning in x[floor(d)] + x[ceiling(d)]: NAs produced by integer overflow
## Warning in x[floor(d)] + x[ceiling(d)]: NAs produced by integer overflow
## Warning in x[floor(d)] + x[ceiling(d)]: NAs produced by integer overflow
## Warning in x[floor(d)] + x[ceiling(d)]: NAs produced by integer overflow

Method 1: Using Empirical rule

clean_pop <- World_population
numeric_cols <- sapply(clean_pop, is.numeric)
clean_pop[numeric_cols] <- lapply(
  clean_pop[numeric_cols],
  function(x) {
    mean_x <- mean(x, na.rm = TRUE)
    sd_x <- sd(x, na.rm = TRUE)
    lower_bound <- mean_x - 3 * sd_x
    upper_bound <- mean_x + 3 * sd_x
    median_x <- median(x, na.rm = TRUE)
    x[x < lower_bound | x > upper_bound] <- median_x
    return(x)
  }
)
# Visualize using boxplot
boxplot(clean_pop[sapply(clean_pop, is.numeric)],
        main = "After Empirical Rule Treatment")

#### Method 2: Winsorization (better in many projects)

clean_pop3 <- World_population
numeric_cols <- sapply(clean_pop3, is.numeric)
clean_pop3[numeric_cols] <- lapply(
  clean_pop3[numeric_cols],
  function(x) {
    Q1 <- quantile(x, 0.25, na.rm = TRUE)
    Q3 <- quantile(x, 0.75, na.rm = TRUE)
    IQR_value <- IQR(x, na.rm = TRUE)
    lower_bound <- Q1 - 1.5 * IQR_value
    upper_bound <- Q3 + 1.5 * IQR_value
    x[x < lower_bound] <- lower_bound
    x[x > upper_bound] <- upper_bound
    return(x)
  }
)
# Visualize using boxplot
boxplot(clean_pop3[sapply(clean_pop3, is.numeric)],
        las = 2,
        col = "blue",
        main = "BoxPlot after Winsorization rule")

(b) Use of vapply(), mapply() to produce vectors, matrices, and arrays as output

#  VECTOR OUTPUT
# Each call returns length-1 numeric → combines to vector
vec <- vapply(1:6, function(x) x^2, FUN.VALUE = numeric(1))
class(vec)  
## [1] "numeric"
length(vec) 
## [1] 6
print(vec)  
## [1]  1  4  9 16 25 36
#  MATRIX OUTPUT
# Each call returns length-3 numeric → stacks as columns
matr <- vapply(1:3, function(x) c(x, x^2, x^3), FUN.VALUE = numeric(3))
class(matr)  
## [1] "matrix" "array"
dim(matr)    
## [1] 3 3
print(matr)
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    1    4    9
## [3,]    1    8   27
#  ARRAY OUTPUT (3D)
# Each call returns a 2x2 matrix → vapply appends a new dimension
array <- vapply(1:2, function(k) array(k, dim = c(2, 2)), 
              FUN.VALUE = array(0, dim = c(2, 2)))
class(array)  
## [1] "array"
dim(array)    
## [1] 2 2 2
print(array)
## , , 1
## 
##      [,1] [,2]
## [1,]    1    1
## [2,]    1    1
## 
## , , 2
## 
##      [,1] [,2]
## [1,]    2    2
## [2,]    2    2

use mapply()

#  VECTOR OUTPUT 
# Each iteration returns a single value → simplified to numeric vector
vec <- mapply(function(x, y) x + y, 1:5, 10:14)
class(vec) 
## [1] "integer"
length(vec) 
## [1] 5
print(vec) 
## [1] 11 13 15 17 19
#  MATRIX OUTPUT
# Each iteration returns a vector of length > 1
# mapply() binds results as COLUMNS by default
mat <- mapply(function(a, b) c(a, b, a * b), 1:4, 2:5)
class(mat)  
## [1] "matrix" "array"
dim(mat)   
## [1] 3 4
print(mat)
##      [,1] [,2] [,3] [,4]
## [1,]    1    2    3    4
## [2,]    2    3    4    5
## [3,]    2    6   12   20
#  ARRAY OUTPUT (3D+)
# Each iteration returns a matrix/array → use SIMPLIFY = "array"
arr <- mapply(
  FUN = function(k) array(k, dim = c(2, 2)),
  1:3,
  SIMPLIFY = "array"
)
class(arr) 
## [1] "array"
dim(arr)    
## [1] 2 2 3
print(arr)
## , , 1
## 
##      [,1] [,2]
## [1,]    1    1
## [2,]    1    1
## 
## , , 2
## 
##      [,1] [,2]
## [1,]    2    2
## [2,]    2    2
## 
## , , 3
## 
##      [,1] [,2]
## [1,]    3    3
## [2,]    3    3

HW 6: Make functions tha calculates summary statistics and apply it to a variable to that it works.

A. Let us start with a build in functions in R and apply to world_population dataset on 2022 population.

 summary_stats<- function(x) {
  
  x <- x[!is.na(x)]
  
  data.frame(
    mean = mean(x),
    median = median(x),
    sd = sd(x),
    Xmin = min(x),
    Q1 = quantile(x, 0.25),
    Q3 = quantile(x, 0.75),
    Xmax = max(x)
  )
}
summary_stats(World_population$X2022.Population)

Manual function that calculates summary statistics

B. I want to create a manual function with body statements to calculates the summary statistics. Then after apply it to dataset selected column to verify how it works. I want to apply to the worl_population dataset by selection Africa vs Europe.

my_summary_stats <- function(x) {
  
  x <- na.omit(x)
  n <- length(x)
  
  # Mean
  mean_val <- sum(x) / n
  
  # Sort data
  x_sorted <- sort(x)
  
  # Median (Q2)
  if (n %% 2 == 1) {
    median_val <- x_sorted[(n + 1) / 2]
  } else {
    median_val <- (x_sorted[n/2] + x_sorted[n/2 + 1]) / 2
  }
  
  # Min and Max
  min_val <- x_sorted[1]
  max_val <- x_sorted[n]
  
  # Variance and SD
  variance <- sum((x - mean_val)^2) / (n - 1)
  sd_val <- sqrt(variance)
  
  # Quartiles
  Q1 <- quantile(x_sorted, 0.25, type = 7)
  Q3 <- quantile(x_sorted, 0.75, type = 7)
  
  # Interquartile Range
  IQR_val <- Q3 - Q1
  
 # Return as table (data frame)
  result <- data.frame(
    Statistic = c("Mean", "Median", "Q1", "Q3", "IQR", "Min", "Max", "SD"),
    Value = c(mean_val, median_val, Q1, Q3, IQR_val, min_val, max_val, sd_val)
  )
  
  return(result)
}
my_summary_stats(World_population$X2022.Population)

Two-sample t-test

It compares the means of two groups: The null hypothesis is stated that both means are equal. While alternative hypothesis stated that means are different. t = (mean1-mean2)/(sqrt(var1/n1+var2/n2))

my_t_test <- function(x, y) {
  # Remove missing values
  x <- na.omit(x)
  y <- na.omit(y)
  
  # Sample sizes
  n1 <- length(x)
  n2 <- length(y)
  
  # Means
  mean1 <- mean(x)
  mean2 <- mean(y)
  
  # Variances
  var1 <- var(x)
  var2 <- var(y)
  
  # Standard error
  se <- sqrt(var1/n1 + var2/n2)
  
  # t-statistic
  t_stat <- (mean1 - mean2) / se
  
  # Degrees of freedom (Welch approximation)
  df <- (var1/n1 + var2/n2)^2 /
        ((var1^2)/(n1^2*(n1-1)) + (var2^2)/(n2^2*(n2-1)))
  
  # p-value
  p_value <- 2 * pt(-abs(t_stat), df)
  
  # Return results
  result <- data.frame(
    Statistic = c("Mean_X", "Mean_Y", "t_value", "df", "p_value"),
    Value = c(mean1, mean2, t_stat, df, p_value)
  )
  
  return(result)
}
my_t_test(
  World_population$X2022.Population[World_population$Continent == "Africa"],
  World_population$X2022.Population[World_population$Continent == "Europe"]
)