All homework in Big Data Analytics

#Homework1: Import data from statistical package and from data management system

# Load readr, which provides read_csv()
library(readr)
# NOTE(review): the path below ends in .xlsx (an Excel workbook), but
# read_csv() parses delimited text. The captured output shows the effect:
# readr treated the file as a zip archive and read only its
# '[Content_Types].xml' entry, producing a 1 x 1 tibble of XML text rather
# than the student records. An Excel reader is needed for this file
# (e.g. readxl::read_excel() -- not loaded anywhere in this script; TODO),
# or the sheet should be exported to CSV first.
liste_of_students_okk <- read_csv("~/MASTERS NOTES AND ASSIGNEMENT/R PROGRAMING/PRACTICAL/INTRO/intro/liste of students okk.xlsx")
## Multiple files in zip: reading '[Content_Types].xml'
## Rows: 1 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the imported object in the data viewer
View(liste_of_students_okk)
# Print the first rows; only one row of XML text exists (see note above)
head(liste_of_students_okk)
## # A tibble: 1 × 1
##   `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`                     
##   <chr>                                                                         
## 1 "<Types xmlns=\"http://schemas.openxmlformats.org/package/2006/content-types\…
# Per-column summary of the imported object
summary(liste_of_students_okk)
##  <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
##  Length   :   1                                         
##  N.unique :   1                                         
##  N.blank  :   0                                         
##  Min.nchar:1227                                         
##  Max.nchar:1227
# Import the second dataset: COVID-19 case counts stored as a CSV file
library(readr)
covid_19_clean_complete <- read_csv(
  file = "~/MASTERS NOTES AND ASSIGNEMENT/R PROGRAMING/PRACTICAL/INTRO/intro/covid_19_clean_complete.csv"
)
## Rows: 49068 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): Province/State, Country/Region, WHO Region
## dbl  (6): Lat, Long, Confirmed, Deaths, Recovered, Active
## date (1): Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the data in the spreadsheet-style viewer
View(covid_19_clean_complete)
# Show the first six rows
head(x = covid_19_clean_complete, n = 6L)
## # A tibble: 6 × 10
##   `Province/State` `Country/Region`      Lat   Long Date       Confirmed Deaths
##   <chr>            <chr>               <dbl>  <dbl> <date>         <dbl>  <dbl>
## 1 <NA>             Afghanistan          33.9  67.7  2020-01-22         0      0
## 2 <NA>             Albania              41.2  20.2  2020-01-22         0      0
## 3 <NA>             Algeria              28.0   1.66 2020-01-22         0      0
## 4 <NA>             Andorra              42.5   1.52 2020-01-22         0      0
## 5 <NA>             Angola              -11.2  17.9  2020-01-22         0      0
## 6 <NA>             Antigua and Barbuda  17.1 -61.8  2020-01-22         0      0
## # ℹ 3 more variables: Recovered <dbl>, Active <dbl>, `WHO Region` <chr>
# Column-by-column summary statistics
summary(object = covid_19_clean_complete)
##    Province/State    Country/Region       Lat               Long        
##  Length   :49068   Length   :49068   Min.   :-51.796   Min.   :-135.00  
##  N.unique :   78   N.unique :  187   1st Qu.:  7.873   1st Qu.: -15.31  
##  N.blank  :    0   N.blank  :    0   Median : 23.634   Median :  21.75  
##  Min.nchar:    5   Min.nchar:    2   Mean   : 21.434   Mean   :  23.53  
##  Max.nchar:   28   Max.nchar:   32   3rd Qu.: 41.204   3rd Qu.:  80.77  
##  NAs      :34404                     Max.   : 71.707   Max.   : 178.06  
##       Date              Confirmed           Deaths           Recovered      
##  Min.   :2020-01-22   Min.   :      0   Min.   :     0.0   Min.   :      0  
##  1st Qu.:2020-03-08   1st Qu.:      4   1st Qu.:     0.0   1st Qu.:      0  
##  Median :2020-04-24   Median :    168   Median :     2.0   Median :     29  
##  Mean   :2020-04-24   Mean   :  16885   Mean   :   884.2   Mean   :   7916  
##  3rd Qu.:2020-06-10   3rd Qu.:   1518   3rd Qu.:    30.0   3rd Qu.:    666  
##  Max.   :2020-07-27   Max.   :4290259   Max.   :148011.0   Max.   :1846641  
##      Active            WHO Region   
##  Min.   :    -14   Length   :49068  
##  1st Qu.:      0   N.unique :    6  
##  Median :     26   N.blank  :    0  
##  Mean   :   8085   Min.nchar:    6  
##  3rd Qu.:    606   Max.nchar:   21  
##  Max.   :2816444

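Summary: The first code block attempts to import the student dataset into R using the read_csv() function from the readr package. Because that file is an Excel workbook (.xlsx) rather than a CSV, read_csv() only reads the archive's '[Content_Types].xml' entry and returns a 1 × 1 tibble of XML text, so the student records are not actually loaded; an Excel reader is needed for this file. The result is then inspected using functions like View(), head(), and summary(). The second code block imports the COVID-19 dataset into R from a CSV file using read_csv(). It then displays and summarizes the dataset to understand its structure and main characteristics.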

# Homework2: Merge two datasets 
# Import the first dataset: life expectancy indicators per country and year
library(readr)
Life_Expectancy_Data <- read_csv(
  file = "~/MASTERS NOTES AND ASSIGNEMENT/R PROGRAMING/PRACTICAL/INTRO/ASSIGNEMENT/archive (10)/Life Expectancy Data.csv"
)
## Rows: 2938 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): Country, Status
## dbl (20): Year, Life expectancy, Adult Mortality, infant deaths, Alcohol, pe...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the data in the spreadsheet-style viewer
View(Life_Expectancy_Data)

# Import the second dataset: one row per country, one column per year
library(readr)
X2020_2025 <- read_csv(
  file = "~/MASTERS NOTES AND ASSIGNEMENT/R PROGRAMING/PRACTICAL/INTRO/ASSIGNEMENT/archive (11)/2020-2025.csv"
)
## Rows: 196 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Country
## dbl (6): 2020, 2021, 2022, 2023, 2024, 2025
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the data in the spreadsheet-style viewer
View(X2020_2025)

# List the column names of both tables to find the keys they share
names(Life_Expectancy_Data)
##  [1] "Country"                         "Year"                           
##  [3] "Status"                          "Life expectancy"                
##  [5] "Adult Mortality"                 "infant deaths"                  
##  [7] "Alcohol"                         "percentage expenditure"         
##  [9] "Hepatitis B"                     "Measles"                        
## [11] "BMI"                             "under-five deaths"              
## [13] "Polio"                           "Total expenditure"              
## [15] "Diphtheria"                      "HIV/AIDS"                       
## [17] "GDP"                             "Population"                     
## [19] "thinness  1-19 years"            "thinness 5-9 years"             
## [21] "Income composition of resources" "Schooling"
names(X2020_2025)
## [1] "Country" "2020"    "2021"    "2022"    "2023"    "2024"    "2025"
# Convert X2020_2025 to LONG format
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Reshape the 2020-2025 table from wide (one column per year) to long
# (one row per Country-Year pair). The former column names become the Year
# values as character strings, so they are converted to numeric to match the
# <dbl> Year column of Life_Expectancy_Data.
X2020_2025_long <- X2020_2025 %>%
  pivot_longer(
    cols = -Country,
    names_to = "Year",
    values_to = "Value"
  ) %>%
  mutate(Year = as.numeric(Year))

# Merge the two tables on Country and Year.
# An inner join keeps only the Country-Year pairs present in BOTH tables.
# The life expectancy rows and the 2020-2025 columns have no year in common,
# so inner_join() produced a 0-row result. full_join() keeps every row from
# both inputs and fills the columns that are missing on either side with NA,
# so merged_data still has the same columns (including Value) but is no
# longer empty.
merged_data <- Life_Expectancy_Data %>%
  full_join(X2020_2025_long, by = c("Country", "Year"))

Explanation of the Merge Process

In this assignment, two datasets were imported into R: Life Expectancy Data and 2020–2025 data. The Life Expectancy dataset was already in a long format, containing variables such as Country and Year.

However, the 2020–2025 dataset was in a wide format, where each year (2020–2025) was stored as a separate column. To make both datasets compatible for merging, the second dataset was transformed into a long format using the pivot_longer() function from the tidyr package. This restructuring created two key columns: Year and Value.

The Year column was then converted to a numeric format to ensure consistency between both datasets.

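Finally, the two datasets were merged on the common variables Country and Year, with the aim of obtaining a single dataset that combines life expectancy information with the values from the 2020–2025 dataset. Note that an inner join on these keys returns no rows here: the Life Expectancy data appears to stop at 2015 while the second dataset covers 2020–2025, so no Country–Year pair occurs in both (the summary of merged_data in Homework 3 is an empty 0 × 2 tibble). A join that keeps unmatched rows, such as full_join(), is needed for the merged dataset to contain data.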

# Homework3: group_by() and the pipe %>%
# Keep only the country, year, and life-expectancy columns
Life_Expectancy_Data %>%
  select(all_of(c("Country", "Year", "Life expectancy")))
## # A tibble: 2,938 × 3
##    Country      Year `Life expectancy`
##    <chr>       <dbl>             <dbl>
##  1 Afghanistan  2015              65  
##  2 Afghanistan  2014              59.9
##  3 Afghanistan  2013              59.9
##  4 Afghanistan  2012              59.5
##  5 Afghanistan  2011              59.2
##  6 Afghanistan  2010              58.8
##  7 Afghanistan  2009              58.6
##  8 Afghanistan  2008              58.1
##  9 Afghanistan  2007              57.5
## 10 Afghanistan  2006              57.3
## # ℹ 2,928 more rows
library(dplyr)

# Mean life expectancy per country; missing values are ignored
Life_Expectancy_Data %>%
  group_by(Country) %>%
  summarise(
    Average_Life_Expectancy = mean(`Life expectancy`, na.rm = TRUE),
    .groups = "drop"
  )
## # A tibble: 193 × 2
##    Country             Average_Life_Expectancy
##    <chr>                                 <dbl>
##  1 Afghanistan                            58.2
##  2 Albania                                75.2
##  3 Algeria                                73.6
##  4 Angola                                 49.0
##  5 Antigua and Barbuda                    75.1
##  6 Argentina                              75.2
##  7 Armenia                                73.4
##  8 Australia                              81.8
##  9 Austria                                81.5
## 10 Azerbaijan                             70.7
## # ℹ 183 more rows
# Mean of the merged Value column per country; missing values are ignored
merged_data %>%
  group_by(Country) %>%
  summarise(
    Average_Value = mean(Value, na.rm = TRUE),
    .groups = "drop"
  )
## # A tibble: 0 × 2
## # ℹ 2 variables: Country <chr>, Average_Value <dbl>
# Number of rows recorded for each country
Life_Expectancy_Data %>%
  count(Country, name = "Number_of_Records")
## # A tibble: 193 × 2
##    Country             Number_of_Records
##    <chr>                           <int>
##  1 Afghanistan                        16
##  2 Albania                            16
##  3 Algeria                            16
##  4 Angola                             16
##  5 Antigua and Barbuda                16
##  6 Argentina                          16
##  7 Armenia                            16
##  8 Australia                          16
##  9 Austria                            16
## 10 Azerbaijan                         16
## # ℹ 183 more rows
# use select(),filter(),arrange(),rename(),mutate()
library(dplyr)

# Small example table: one row per student
students <- data.frame(
  id    = as.numeric(1:5),
  name  = c("Amina", "Brian", "Claire", "David", "Eva"),
  age   = c(19, 21, 20, 22, 19),
  score = c(78, 85, 92, 60, 88),
  city  = c("Kigali", "Nairobi", "Kampala", "Kigali", "Nairobi")
)

# select(): keep only the name, age, and score columns
students_select <- select(students, name, age, score)
# filter(): keep the students who scored 80 or more
students_filter <- filter(students, score >= 80)
# arrange(): sort from the highest score to the lowest
students_arranged <- arrange(students, desc(score))
# rename(): give two columns more descriptive names (new_name = old_name)
students_renamed <- rename(
  students,
  student_name = name,
  student_score = score
)
# mutate(): add a letter grade and a score with a 5-point bonus
students_mutated <- mutate(
  students,
  grade = ifelse(score >= 80, "A", "B"),
  score_plus_5 = score + 5
)
# All verbs chained: passing students (score >= 70), three columns, a status
# label, sorted from the highest score to the lowest
students_final <- students %>%
  select(name, age, score) %>%
  filter(score >= 70) %>%
  arrange(desc(score)) %>%
  mutate(status = "passed")

Explanation: The group_by() function in R groups data by a specific variable such as Country or Year, and the pipe operator %>% connects multiple operations in a readable sequence. After grouping the data, the summarise() function calculates summary statistics such as the mean or the count for each group. Here the data was analyzed with the dplyr package together with the pipe operator to make the code easier to read and follow: group_by() grouped the data by Country, and summarise() calculated the average life expectancy and the number of records for each country. The verbs select(), filter(), arrange(), rename(), and mutate() were then demonstrated on a small students data frame. These steps helped to understand differences between countries and summarize key patterns in both the original and merged datasets.

# Homework4: How to use trace() and recover()

# Returns x squared plus twice y.
my_function <- function(x, y) {
  squared_x <- x^2
  doubled_y <- 2 * y
  squared_x + doubled_y
}

# Add tracing: temporarily insert extra code into my_function without
# editing its definition
trace(
  # Function to trace, given by name
  "my_function",
  # Tracer expression: evaluated inside the function call, so it can see the
  # arguments x and y
  quote(cat("Inside my_function: x =", x, ", y =", y, "\n")),
  # Run the tracer just before step 1 of the function body
  at = 1,
  # Suppress trace()'s own descriptive line; only the cat() output is shown
  print = FALSE
)
## [1] "my_function"
# Call the traced function: the tracer prints first, then the result
my_function(2, 3)
## Inside my_function: x = 2 , y = 3
## [1] 10
# Remove tracing afterward so later calls run the original function
untrace("my_function")

# Enable recover debugging on errors: from here on, an uncaught error hands
# control to recover(), which lists the active function calls and lets you
# browse the environment of any of them
options(error = recover)

# Example function with an error.
# It performs the same calculation as my_function and then fails on purpose,
# so recover() has a frame with `x`, `y`, and `result` to inspect. Nothing
# follows stop() because code after it could never run.
my_function_debug <- function(x, y) {
  result <- (x^2 + 2 * y)
  stop("Test error")
}

my_function_debug(2, 3)
## Error in my_function_debug(2, 3): Test error

# Restore the default error behaviour once the demonstration is over;
# otherwise every later error in the session would start recover() as well.
# When the script is run non-interactively, execution stops at the error
# above, so run this line by hand in that case.
options(error = NULL)

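Explanation: The trace() function was used to insert a cat() call into my_function so that the values of x and y are printed each time the function runs; untrace() then removed the inserted code. The recover() function was used to debug errors in R by allowing interactive inspection of the environment where an error occurs. When an error is triggered, R pauses execution and provides options to explore the function stack. After testing, the error option should be reset using options(error = NULL).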

#Function for Summary Statistics
# Create summary statistics function
#
# Computes the mean, median, standard deviation, minimum, and maximum of `x`.
#
# Arguments:
#   x      A numeric vector.
#   na.rm  If TRUE, missing values are removed before each statistic is
#          computed. Defaults to FALSE, in which case any NA in `x` makes
#          every statistic NA.
#
# Returns a named list with the elements Mean, Median, SD, Minimum, Maximum.
summary_stats <- function(x, na.rm = FALSE) {
  list(
    Mean = mean(x, na.rm = na.rm),
    Median = median(x, na.rm = na.rm),
    SD = sd(x, na.rm = na.rm),
    Minimum = min(x, na.rm = na.rm),
    Maximum = max(x, na.rm = na.rm)
  )
}

# Example variable
scores <- c(12, 15, 18, 20, 22, 25)

# Apply function
summary_stats(scores)
## $Mean
## [1] 18.66667
## 
## $Median
## [1] 19
## 
## $SD
## [1] 4.718757
## 
## $Minimum
## [1] 12
## 
## $Maximum
## [1] 25
#Function for Two-Sample t-test
# Create two-sample t-test function
#
# Compares the means of two numeric samples with t.test().
#
# Arguments:
#   x, y  Numeric vectors holding the two samples.
#   ...   Further arguments passed on to t.test(), for example
#         var.equal = TRUE, paired = TRUE, or alternative = "less".
#         With none given, t.test() runs its default Welch two-sided test.
#
# Returns the "htest" object produced by t.test().
two_sample_ttest <- function(x, y, ...) {
  t.test(x, y, ...)
}

# Example data
group1 <- c(12, 14, 15, 16, 18)
group2 <- c(20, 22, 24, 25, 27)

# Apply function
two_sample_ttest(group1, group2)
## 
##  Welch Two Sample t-test
## 
## data:  x and y
## t = -5.4832, df = 7.7297, p-value = 0.0006587
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -12.238958  -4.961042
## sample estimates:
## mean of x mean of y 
##      15.0      23.6

Explanation: The summary_stats() function calculates basic descriptive statistics such as the mean, median, standard deviation, minimum, and maximum values of a dataset. The two_sample_ttest() function performs a two-sample t-test to compare the means of two groups and determine whether they are significantly different. In the examples above, the functions are applied to sample numeric data to demonstrate how user-defined functions can simplify statistical analysis in R.

#sapply()
# Build a named list of three integer sequences
numbers <- setNames(list(1:5, 6:10, 11:15), c("a", "b", "c"))

# Mean of each list element, simplified to a named vector
sapply(X = numbers, FUN = mean)
##  a  b  c 
##  3  8 13
#vapply()
# Build the same named list again so this example runs on its own
numbers <- setNames(list(1:5, 6:10, 11:15), c("a", "b", "c"))

# Mean of each list element; FUN.VALUE states that every result must be a
# single numeric value
vapply(X = numbers, FUN = mean, FUN.VALUE = numeric(1))
##  a  b  c 
##  3  8 13
#mapply()
# Two integer vectors of equal length
x <- seq_len(5)
y <- x + 5L

# Element-wise sum: the function receives one value from each vector per call
mapply(FUN = function(left, right) left + right, x, y)
## [1]  7  9 11 13 15
#Map()
# The same two vectors, recreated so this example runs on its own
x <- seq_len(5)
y <- x + 5L

# Element-wise product, returned as a list with one element per pair
Map(f = function(left, right) left * right, x, y)
## [[1]]
## [1] 6
## 
## [[2]]
## [1] 14
## 
## [[3]]
## [1] 24
## 
## [[4]]
## [1] 36
## 
## [[5]]
## [1] 50
#split()
# Scores of six students
scores <- c(80, 75, 90, 85, 70, 95)

# Group label for each score: two students in each of A, B, and C
group <- rep(c("A", "B", "C"), each = 2)

# Break the scores into one vector per group
split(x = scores, f = group)
## $A
## [1] 80 75
## 
## $B
## [1] 90 85
## 
## $C
## [1] 70 95
#tapply()
# The same scores, recreated so this example runs on its own
scores <- c(80, 75, 90, 85, 70, 95)

# Matching group labels
group <- rep(c("A", "B", "C"), each = 2)

# Mean score within each group
tapply(X = scores, INDEX = group, FUN = mean)
##    A    B    C 
## 77.5 87.5 82.5

Explanation: In the examples above, sapply() applies the mean() function to each element in the list and returns the results as a vector. vapply() applies the mean() function and ensures the output is numeric. mapply() adds elements from two vectors position by position. Map() applies a function to multiple inputs and returns the result as a list. split() separates the scores according to their groups. tapply() calculates the mean score for each group separately. In summary: sapply() applies a function to a list or vector and returns a simplified output like a vector or matrix. vapply() is similar to sapply() but requires specifying the output type for safer results. mapply() applies a function to multiple vectors or lists at the same time. Map() works like mapply() but returns the output as a list. split() divides data into groups based on a factor or category. tapply() applies a function to grouped data and summarizes the results.