The haven package in R is designed to import and export foreign statistical data formats from SAS,SPSS, and Stata files into R. #### install packages and libraries
#install.packages("haven")
#install.packages("readxl")
#install.packages("readr")
#install.packages("sas7bdat")
library(haven)
library(readxl)
library(readr)
library(sas7bdat)
#Import SPSS (.sav) file in R using haven library
data_spss <- read_sav("C:/Users/LENOVO/OneDrive/Desktop/spss_data.sav")
head(data_spss)
data_excell <- read_excel("C:/Users/LENOVO/OneDrive/Desktop/data_excel.xlsx")
head(data_excell)
data_csv <- read.csv("C:/Users/LENOVO/OneDrive/Desktop/data3.csv")
head(data_csv)
The merge() function in R is used to combine two datasets using one or more common columns. They are different types of merging dataframe like (merge by default, left merge, right merge, merge all). Sometimes the data frame might have different dimensions or matching columns are named in different ways.
Let me start by merge dateframes having the same dimensions ### Dataframe creation or loading datasets in R
#First dataframe
data_1 <- data.frame(
ID = c(1, 2, 3, 4, 5),
Names = c("Alice", "John", "Mary","Simon", "Caleb"),
Gender = c("F","M","F","M","M"),
Age = c(22,25,19,30,28),
Country =c("Rwanda", "Kenya","Nigeria","Rwanda", "Tanzania"),
Major =c("Statistics","IT","Maths","Busines","Medical")
)
#Second dataframe
data_2<-data.frame(
ID=c(1,2,3,4,5),
Names = c("Alice", "John", "Mary","Simon", "Caleb"),
Gender = c("F","M","F","M","M"),
Reserch_grade = c(18,15,10,16,13),
Python_grade = c(14,16,11,18,15),
Alg_grade =c(16,14,17,12,15),
math_grade = c(15,12,12,17,16),
R_grade =c(15,18,14,16,13)
)
# merge two dataframes by one common factor ("ID)
merged_1<-merge(data_1,data_2,by="ID")
head(merged_1, 3)
# merge two dataframes by one common factor ("ID)
merged_2<-merge(data_1,data_2, by = c("ID", "Names"))
head(merged_2,3)
# merge two dataframes by one common factor ("ID)
merged_3<-merge(data_1,data_2, by = c("ID", "Names","Gender"))
head(merged_3,3)
# Loading datasets from kaggle website
# 1. World population dataset
World_population <-read.csv("C:/Users/LENOVO/OneDrive/Desktop/Datasets/world_population.csv")
head(World_population,3)
#2. CO2_Emission dataset
co2_Emission <-read.csv("C:/Users/LENOVO/OneDrive/Desktop/Datasets/co2_Emission.csv")
head(co2_Emission,3)
cat("\n======== variable names in world population dataset=====\n")
##
## ======== variable names in world population dataset=====
variable.names(World_population)
## [1] "Rank" "CCA3"
## [3] "Country.Territory" "Capital"
## [5] "Continent" "X2022.Population"
## [7] "X2020.Population" "X2015.Population"
## [9] "X2010.Population" "X2000.Population"
## [11] "X1990.Population" "X1980.Population"
## [13] "X1970.Population" "Area..km.."
## [15] "Density..per.km.." "Growth.Rate"
## [17] "World.Population.Percentage"
cat("\n======== variable names in co2_Emission dataset=====\n")
##
## ======== variable names in co2_Emission dataset=====
variable.names(co2_Emission)
## [1] "Country.Name" "country_code" "Region" "Indicator.Name"
## [5] "X1990" "X1991" "X1992" "X1993"
## [9] "X1994" "X1995" "X1996" "X1997"
## [13] "X1998" "X1999" "X2000" "X2001"
## [17] "X2002" "X2003" "X2004" "X2005"
## [21] "X2006" "X2007" "X2008" "X2009"
## [25] "X2010" "X2011" "X2012" "X2013"
## [29] "X2014" "X2015" "X2016" "X2017"
## [33] "X2018" "X2019" "X2019.1"
cat("\n === Checking the dimension of world_population dataset===\n")
##
## === Checking the dimension of world_population dataset===
dim(World_population)
## [1] 234 17
cat("\n=== Checking the dimension of co2_Emission dataset===\n ")
##
## === Checking the dimension of co2_Emission dataset===
##
dim(co2_Emission)
## [1] 215 35
In the two datasets the factor country is written in different ways like Country.Territory in world_population and Country.Name in co2_Emission.
First, let me use merge() function by default that keeps only matching rows
# Using default merge: Keeps only matching rows
# Merge by two common factors
merge_pop_co2 <- merge(
World_population,
co2_Emission,
by.x = c("Country.Territory", "CCA3"),
by.y = c("Country.Name", "country_code")
)
dim(merge_pop_co2)
## [1] 185 50
# Count the total missing values in each column for the merged dataset.
colSums(is.na(merge_pop_co2))
## Country.Territory CCA3
## 0 0
## Rank Capital
## 0 0
## Continent X2022.Population
## 0 0
## X2020.Population X2015.Population
## 0 0
## X2010.Population X2000.Population
## 0 0
## X1990.Population X1980.Population
## 0 0
## X1970.Population Area..km..
## 0 0
## Density..per.km.. Growth.Rate
## 0 0
## World.Population.Percentage Region
## 0 0
## Indicator.Name X1990
## 0 22
## X1991 X1992
## 21 19
## X1993 X1994
## 19 19
## X1995 X1996
## 18 18
## X1997 X1998
## 18 19
## X1999 X2000
## 19 18
## X2001 X2002
## 18 17
## X2003 X2004
## 17 17
## X2005 X2006
## 17 17
## X2007 X2008
## 17 17
## X2009 X2010
## 17 17
## X2011 X2012
## 17 17
## X2013 X2014
## 17 17
## X2015 X2016
## 17 17
## X2017 X2018
## 17 17
## X2019 X2019.1
## 17 17
Handling the missing values by replacing with mean column value
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Example 1:
data("mtcars")
# Without piping
mean(mtcars$mpg)
## [1] 20.09062
#With pipe
mtcars %>%
summarise(mean_mpg = mean(mpg))
This is very common in statistics since it preverses datasets size and reduces bias in the dataset.
# Example 2: Handling missing values by replacing NA by mean.
library(dplyr)
clean_data <- merge_pop_co2 %>%
mutate(
across(
where(is.numeric),
~ ifelse(is.na(.),
mean(., na.rm = TRUE),
.)
)
)
#is.na(clean_data1)
colSums(is.na(clean_data))
## Country.Territory CCA3
## 0 0
## Rank Capital
## 0 0
## Continent X2022.Population
## 0 0
## X2020.Population X2015.Population
## 0 0
## X2010.Population X2000.Population
## 0 0
## X1990.Population X1980.Population
## 0 0
## X1970.Population Area..km..
## 0 0
## Density..per.km.. Growth.Rate
## 0 0
## World.Population.Percentage Region
## 0 0
## Indicator.Name X1990
## 0 0
## X1991 X1992
## 0 0
## X1993 X1994
## 0 0
## X1995 X1996
## 0 0
## X1997 X1998
## 0 0
## X1999 X2000
## 0 0
## X2001 X2002
## 0 0
## X2003 X2004
## 0 0
## X2005 X2006
## 0 0
## X2007 X2008
## 0 0
## X2009 X2010
## 0 0
## X2011 X2012
## 0 0
## X2013 X2014
## 0 0
## X2015 X2016
## 0 0
## X2017 X2018
## 0 0
## X2019 X2019.1
## 0 0
What is group_by()? It splits your data into groups so operations are done within each group. Here, we want to calculate the total population by continent
#Example 1: Total population by continent
clean_data %>%
group_by(Continent) %>%
summarise(
total_population = sum(X2022.Population, na.rm = TRUE)
)
Use of select(), filter(), rename(), arrange(), group_by(), and %>%
library(dplyr)
result <- World_population %>%
# 1. Rename columns (example)
rename(
Country = Country.Territory,
Pop_2022 = X2022.Population
) %>%
# 2. Select only useful columns
select(Country, CCA3, Continent, Pop_2022) %>%
# 3. Filter (example: keep large population countries)
filter(Pop_2022 > 50000000) %>%
# 4. Create new variable
mutate(
Pop_Millions = Pop_2022 / 1000000
) %>%
# 5. Group by continent
group_by(Continent) %>%
# 6. Summarise (important step after group_by)
summarise(
avg_population = mean(Pop_Millions, na.rm = TRUE),
total_population = sum(Pop_2022, na.rm = TRUE),
countries = n()
) %>%
# 7. Arrange results
arrange(desc(total_population))
# View result
result
divide_num <- function(a, b) {
results <- (a / b)+5 # R returns Inf here if b is 0
if (is.infinite(results)) {
return("Error: divide a number by zero is not possible")
} else {
return(results)
}
}
divide_num(3,0)
## [1] "Error: divide a number by zero is not possible"
The trace() function helps you see what happens inside a function while it is running. We pply trace() to watch execution of the function created.
#trace(what, tracer)
trace(
divide_num,
tracer = quote(paste("Function divide_num is running: Please wait!\n"))
)
## [1] "divide_num"
divide_num(10, 2)
## Tracing divide_num(10, 2) on entry
## [1] 10
divide_num <- function(a,b) {
if (b == 0) {
stop("Error: divide a number by zero is not possible")
}
results <- (a/b)
return(results)
}
# Activate recover mode
options(error = recover)
# Run the function
divide_num(3,2)
## [1] 1.5
# Define the function
divide_num <- function(a,b) {
if (b == 0) {
stop("Error: divide a number by zero is not possible")
}
results <- (a/b+2)
return(results)
}
# Add tracing to the function
trace(
divide_num,
tracer = quote(cat("Tracing: a =", a, " b =", b, "\n"))
)
## [1] "divide_num"
# Activate recover mode
options(error = recover)
# Run the function
divide_num(3,0)
## Tracing divide_num(3, 0) on entry
## Tracing: a = 3 b = 0
## Error in divide_num(3, 0): Error: divide a number by zero is not possible
# Step 1: Creating a function
sqrt_num <- function(x) {
if (x < 0) {
return("Error: square root of a negative number is not possible under real numbers")
}
result <- sqrt(x)
return(result)
}
# Example
print(paste("The square root of 25 is:", sqrt_num(25)))
## [1] "The square root of 25 is: 5"
print(paste("The square root of 49 is:", sqrt_num(49)))
## [1] "The square root of 49 is: 7"
print(paste("The square root of -4 is:", sqrt_num(-4)))
## [1] "The square root of -4 is: Error: square root of a negative number is not possible under real numbers"
# Handling missing values
# Use of sapply(), lapply() to select numeric columns from dataframe
clean_data <- merge_pop_co2
numeric_cols <- sapply(clean_data, is.numeric)
clean_data[numeric_cols] <- lapply(
clean_data1[numeric_cols],
function(x) {
x[is.na(x)] <- mean(x, na.rm = TRUE)
return(x)
}
)
## Error: object 'clean_data1' not found
colSums(is.na(clean_data))
## Country.Territory CCA3
## 0 0
## Rank Capital
## 0 0
## Continent X2022.Population
## 0 0
## X2020.Population X2015.Population
## 0 0
## X2010.Population X2000.Population
## 0 0
## X1990.Population X1980.Population
## 0 0
## X1970.Population Area..km..
## 0 0
## Density..per.km.. Growth.Rate
## 0 0
## World.Population.Percentage Region
## 0 0
## Indicator.Name X1990
## 0 22
## X1991 X1992
## 21 19
## X1993 X1994
## 19 19
## X1995 X1996
## 18 18
## X1997 X1998
## 18 19
## X1999 X2000
## 19 18
## X2001 X2002
## 18 17
## X2003 X2004
## 17 17
## X2005 X2006
## 17 17
## X2007 X2008
## 17 17
## X2009 X2010
## 17 17
## X2011 X2012
## 17 17
## X2013 X2014
## 17 17
## X2015 X2016
## 17 17
## X2017 X2018
## 17 17
## X2019 X2019.1
## 17 17
boxplot(World_population[sapply(World_population, is.numeric)],
main = "Before Outlier Treatment")
## Warning in x[floor(d)] + x[ceiling(d)]: NAs produced by integer overflow
## Warning in x[floor(d)] + x[ceiling(d)]: NAs produced by integer overflow
## Warning in x[floor(d)] + x[ceiling(d)]: NAs produced by integer overflow
## Warning in x[floor(d)] + x[ceiling(d)]: NAs produced by integer overflow
## Warning in x[floor(d)] + x[ceiling(d)]: NAs produced by integer overflow
## Warning in x[floor(d)] + x[ceiling(d)]: NAs produced by integer overflow
clean_pop <- World_population
numeric_cols <- sapply(clean_pop, is.numeric)
clean_pop[numeric_cols] <- lapply(
clean_pop[numeric_cols],
function(x) {
mean_x <- mean(x, na.rm = TRUE)
sd_x <- sd(x, na.rm = TRUE)
lower_bound <- mean_x - 3 * sd_x
upper_bound <- mean_x + 3 * sd_x
median_x <- median(x, na.rm = TRUE)
x[x < lower_bound | x > upper_bound] <- median_x
return(x)
}
)
# Visualize using boxplot
boxplot(clean_pop[sapply(clean_pop, is.numeric)],
main = "After Empirical Rule Treatment")
#### Method 2: Winsorization (better in many projects)
clean_pop3 <- World_population
numeric_cols <- sapply(clean_pop3, is.numeric)
clean_pop3[numeric_cols] <- lapply(
clean_pop3[numeric_cols],
function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR_value <- IQR(x, na.rm = TRUE)
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
x[x < lower_bound] <- lower_bound
x[x > upper_bound] <- upper_bound
return(x)
}
)
# Visualize using boxplot
boxplot(clean_pop3[sapply(clean_pop3, is.numeric)],
las = 2,
col = "blue",
main = "BoxPlot after Winsorization rule")
# VECTOR OUTPUT
# Each call returns length-1 numeric → combines to vector
vec <- vapply(1:6, function(x) x^2, FUN.VALUE = numeric(1))
class(vec)
## [1] "numeric"
length(vec)
## [1] 6
print(vec)
## [1] 1 4 9 16 25 36
# MATRIX OUTPUT
# Each call returns length-3 numeric → stacks as columns
matr <- vapply(1:3, function(x) c(x, x^2, x^3), FUN.VALUE = numeric(3))
class(matr)
## [1] "matrix" "array"
dim(matr)
## [1] 3 3
print(matr)
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 1 4 9
## [3,] 1 8 27
# ARRAY OUTPUT (3D)
# Each call returns a 2x2 matrix → vapply appends a new dimension
array <- vapply(1:2, function(k) array(k, dim = c(2, 2)),
FUN.VALUE = array(0, dim = c(2, 2)))
class(array)
## [1] "array"
dim(array)
## [1] 2 2 2
print(array)
## , , 1
##
## [,1] [,2]
## [1,] 1 1
## [2,] 1 1
##
## , , 2
##
## [,1] [,2]
## [1,] 2 2
## [2,] 2 2
# VECTOR OUTPUT
# Each iteration returns a single value → simplified to numeric vector
vec <- mapply(function(x, y) x + y, 1:5, 10:14)
class(vec)
## [1] "integer"
length(vec)
## [1] 5
print(vec)
## [1] 11 13 15 17 19
# MATRIX OUTPUT
# Each iteration returns a vector of length > 1
# mapply() binds results as COLUMNS by default
mat <- mapply(function(a, b) c(a, b, a * b), 1:4, 2:5)
class(mat)
## [1] "matrix" "array"
dim(mat)
## [1] 3 4
print(mat)
## [,1] [,2] [,3] [,4]
## [1,] 1 2 3 4
## [2,] 2 3 4 5
## [3,] 2 6 12 20
# ARRAY OUTPUT (3D+)
# Each iteration returns a matrix/array → use SIMPLIFY = "array"
arr <- mapply(
FUN = function(k) array(k, dim = c(2, 2)),
1:3,
SIMPLIFY = "array"
)
class(arr)
## [1] "array"
dim(arr)
## [1] 2 2 3
print(arr)
## , , 1
##
## [,1] [,2]
## [1,] 1 1
## [2,] 1 1
##
## , , 2
##
## [,1] [,2]
## [1,] 2 2
## [2,] 2 2
##
## , , 3
##
## [,1] [,2]
## [1,] 3 3
## [2,] 3 3
A. Let us start with a build in functions in R and apply to world_population dataset on 2022 population.
summary_stats<- function(x) {
x <- x[!is.na(x)]
data.frame(
mean = mean(x),
median = median(x),
sd = sd(x),
Xmin = min(x),
Q1 = quantile(x, 0.25),
Q3 = quantile(x, 0.75),
Xmax = max(x)
)
}
summary_stats(World_population$X2022.Population)
B. I want to create a manual function with body statements to calculates the summary statistics. Then after apply it to dataset selected column to verify how it works. I want to apply to the worl_population dataset by selection Africa vs Europe.
my_summary_stats <- function(x) {
x <- na.omit(x)
n <- length(x)
# Mean
mean_val <- sum(x) / n
# Sort data
x_sorted <- sort(x)
# Median (Q2)
if (n %% 2 == 1) {
median_val <- x_sorted[(n + 1) / 2]
} else {
median_val <- (x_sorted[n/2] + x_sorted[n/2 + 1]) / 2
}
# Min and Max
min_val <- x_sorted[1]
max_val <- x_sorted[n]
# Variance and SD
variance <- sum((x - mean_val)^2) / (n - 1)
sd_val <- sqrt(variance)
# Quartiles
Q1 <- quantile(x_sorted, 0.25, type = 7)
Q3 <- quantile(x_sorted, 0.75, type = 7)
# Interquartile Range
IQR_val <- Q3 - Q1
# Return as table (data frame)
result <- data.frame(
Statistic = c("Mean", "Median", "Q1", "Q3", "IQR", "Min", "Max", "SD"),
Value = c(mean_val, median_val, Q1, Q3, IQR_val, min_val, max_val, sd_val)
)
return(result)
}
my_summary_stats(World_population$X2022.Population)
It compares the means of two groups: The null hypothesis is stated that both means are equal. While alternative hypothesis stated that means are different. t = (mean1-mean2)/(sqrt(var1/n1+var2/n2))
my_t_test <- function(x, y) {
# Remove missing values
x <- na.omit(x)
y <- na.omit(y)
# Sample sizes
n1 <- length(x)
n2 <- length(y)
# Means
mean1 <- mean(x)
mean2 <- mean(y)
# Variances
var1 <- var(x)
var2 <- var(y)
# Standard error
se <- sqrt(var1/n1 + var2/n2)
# t-statistic
t_stat <- (mean1 - mean2) / se
# Degrees of freedom (Welch approximation)
df <- (var1/n1 + var2/n2)^2 /
((var1^2)/(n1^2*(n1-1)) + (var2^2)/(n2^2*(n2-1)))
# p-value
p_value <- 2 * pt(-abs(t_stat), df)
# Return results
result <- data.frame(
Statistic = c("Mean_X", "Mean_Y", "t_value", "df", "p_value"),
Value = c(mean1, mean2, t_stat, df, p_value)
)
return(result)
}
my_t_test(
World_population$X2022.Population[World_population$Continent == "Africa"],
World_population$X2022.Population[World_population$Continent == "Europe"]
)