Assignment 1: Importing datasets from different sources

# Import the CSV file and store it in the data frame `students`
students <- read.csv("C:/Users/eniyomufasha/Downloads/student_data.csv")
# View() opens an interactive data viewer, so it is only called when R is
# running interactively; a non-interactive render skips it.
if (interactive()) {
  View(students)
}
# Show the column names and types of the imported data
str(students)
## 'data.frame':    7 obs. of  8 variables:
##  $ StudentID        : int  101 102 103 104 105 106 107
##  $ Name             : chr  "Alice" "Brian" "Cynthia" "David" ...
##  $ Gender           : chr  "F" "M" "F" "M" ...
##  $ Age              : int  20 21 19 22 20 23 21
##  $ Department       : chr  "CS" "IT" "CS" "Math" ...
##  $ MathScore        : int  78 85 90 67 88 74 92
##  $ EnglishScore     : int  82 79 88 70 91 76 95
##  $ AttendancePercent: int  90 85 95 80 92 88 97
# Load the SPSS file into R
file.exists("sales.sav") # Check that the file exists before reading it
## [1] TRUE
#install.packages("haven") # I commented this command because package installation should be done outside the R Markdown file.
library(haven)
data <- read_sav("sales.sav")
# View() opens an interactive data viewer, so it is only called when R is
# running interactively; a non-interactive render skips it.
if (interactive()) {
  View(data)
}
# Show the column names, types and SPSS format attributes
str(data)
## tibble [10 × 6] (S3: tbl_df/tbl/data.frame)
##  $ TransactionID: num [1:10] 1 2 3 4 5 6 7 8 9 10
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ Product      : chr [1:10] "Laptop" "Office Chair" "Smartphone" "Desk" ...
##   ..- attr(*, "format.spss")= chr "A12"
##  $ Category     : chr [1:10] "Electronics" "Furniture" "Electronics" "Furniture" ...
##   ..- attr(*, "format.spss")= chr "A11"
##  $ Quantity     : num [1:10] 2 5 3 2 10 1 20 50 4 1
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ UnitPrice    : num [1:10] 750 120 500 200 30 150 2 1 180 300
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ Date         : chr [1:10] "2026-01-10" "2026-01-11" "2026-01-12" "2026-01-13" ...
##   ..- attr(*, "format.spss")= chr "A10"

Assignment 2: Merging Datasets

# Importing the second dataset, called discipline, so that I can merge it with students
library(readxl)
# Read the Excel workbook into `discipline`, then show its structure
discipline <- read_excel(
  path = "C:/Users/eniyomufasha/Downloads/discipline.xlsx"
)
str(object = discipline)
## tibble [7 × 7] (S3: tbl_df/tbl/data.frame)
##  $ StudentID      : num [1:7] 101 102 103 104 105 106 107
##  $ Name           : chr [1:7] "Alice" "Brian" "Cynthia" "David" ...
##  $ Department     : chr [1:7] "CS" "IT" "CS" "Math" ...
##  $ DisciplineCases: num [1:7] 2 0 5 1 3 4 1
##  $ LateCount      : num [1:7] 3 1 6 2 4 5 2
##  $ BehaviorScore  : num [1:7] 75 90 55 80 65 60 78
##  $ WarningLevel   : chr [1:7] "Medium" "Low" "High" "Low" ...
# Merge on the two key variables StudentID and Department.
# Both inputs also carry a Name column; because Name is not a merge key,
# merge() would return it twice, as Name.x and Name.y. The copy coming from
# `discipline` is dropped before merging so the result has a single Name
# column (assumes both files hold the same names per student, as the
# values visible in the str() outputs do).
merged_data <- merge(students,
                     discipline[, names(discipline) != "Name"],
                     by = c("StudentID", "Department"))
nrow(students) # Number of rows in the students dataset
## [1] 7
nrow(discipline) # Number of rows in the discipline dataset
## [1] 7
nrow(merged_data) # Number of rows in the merged data
## [1] 7
str(merged_data)
## 'data.frame':    7 obs. of  12 variables:
##  $ StudentID        : int  101 102 103 104 105 106 107
##  $ Department       : chr  "CS" "IT" "CS" "Math" ...
##  $ Name             : chr  "Alice" "Brian" "Cynthia" "David" ...
##  $ Gender           : chr  "F" "M" "F" "M" ...
##  $ Age              : int  20 21 19 22 20 23 21
##  $ MathScore        : int  78 85 90 67 88 74 92
##  $ EnglishScore     : int  82 79 88 70 91 76 95
##  $ AttendancePercent: int  90 85 95 80 92 88 97
##  $ DisciplineCases  : num  2 0 5 1 3 4 1
##  $ LateCount        : num  3 1 6 2 4 5 2
##  $ BehaviorScore    : num  75 90 55 80 65 60 78
##  $ WarningLevel     : chr  "Medium" "Low" "High" "Low" ...

Assignment 3: Applying group_by(), %>%, select(), filter(), arrange(), rename() and mutate()

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# group_by() with %>%: build one summary row per Department containing the
# student count and the mean of each score column (missing values ignored)
results <- merged_data %>%
  group_by(Department) %>%
  summarise(
    total_students = n(),
    average_math = mean(MathScore, na.rm = TRUE),
    average_english = mean(EnglishScore, na.rm = TRUE),
    average_attendance = mean(AttendancePercent, na.rm = TRUE),
    average_behavior = mean(BehaviorScore, na.rm = TRUE)
  )
nrow(results) # The number of rows equals the number of departments in the data
## [1] 3
# View() opens an interactive data viewer, so it is only called when R is
# running interactively; a non-interactive render skips it.
if (interactive()) {
  View(results)
}
# select(): keep only the identifying columns and the two score columns
students1 <- select(
  students,
  StudentID, Name, Gender, Department, MathScore, EnglishScore
)
print(students1)
##   StudentID    Name Gender Department MathScore EnglishScore
## 1       101   Alice      F         CS        78           82
## 2       102   Brian      M         IT        85           79
## 3       103 Cynthia      F         CS        90           88
## 4       104   David      M       Math        67           70
## 5       105     Eva      F         IT        88           91
## 6       106   Frank      M         CS        74           76
## 7       107   Grace      F       Math        92           95
# filter(): keep only the students who scored above 75 in English
students2 <- students1 %>%
  filter(EnglishScore > 75)
# Print the filtered result (students2) rather than the unfiltered students1
print(students2)
##   StudentID    Name Gender Department MathScore EnglishScore
## 1       101   Alice      F         CS        78           82
## 2       102   Brian      M         IT        85           79
## 3       103 Cynthia      F         CS        90           88
## 4       105     Eva      F         IT        88           91
## 5       106   Frank      M         CS        74           76
## 6       107   Grace      F       Math        92           95
# arrange(): order the filtered students from highest to lowest EnglishScore
students3 <- arrange(students2, desc(EnglishScore))
print(students3)
##   StudentID    Name Gender Department MathScore EnglishScore
## 1       107   Grace      F       Math        92           95
## 2       105     Eva      F         IT        88           91
## 3       103 Cynthia      F         CS        90           88
## 4       101   Alice      F         CS        78           82
## 5       102   Brian      M         IT        85           79
## 6       106   Frank      M         CS        74           76
# rename(): shorten two column names (written as new_name = old_name)
students4 <- rename(students3, St_ID = StudentID, English = EnglishScore)
print(students4)
##   St_ID    Name Gender Department MathScore English
## 1   107   Grace      F       Math        92      95
## 2   105     Eva      F         IT        88      91
## 3   103 Cynthia      F         CS        90      88
## 4   101   Alice      F         CS        78      82
## 5   102   Brian      M         IT        85      79
## 6   106   Frank      M         CS        74      76
# mutate(): add two new variables, TotalScore (MathScore + English) and
# AverageScore (TotalScore divided by the two subjects)
students_final <- mutate(
  students4,
  TotalScore = MathScore + English,
  AverageScore = TotalScore / 2
)
print(students_final)
##   St_ID    Name Gender Department MathScore English TotalScore AverageScore
## 1   107   Grace      F       Math        92      95        187         93.5
## 2   105     Eva      F         IT        88      91        179         89.5
## 3   103 Cynthia      F         CS        90      88        178         89.0
## 4   101   Alice      F         CS        78      82        160         80.0
## 5   102   Brian      M         IT        85      79        164         82.0
## 6   106   Frank      M         CS        74      76        150         75.0
Applying ggplot2
library(ggplot2)
# Math score by gender (boxplot), one box per gender, filled by gender
ggplot(data = students,
       mapping = aes(x = Gender, y = MathScore, fill = Gender)) +
  geom_boxplot() +
  labs(title = "Math Scores by Gender", x = "Gender", y = "Math Score") +
  theme_minimal()

# Attendance vs. performance: scatter plot of the students with a fitted
# linear-model ("lm") smoothing line on top of the points
ggplot(data = students,
       mapping = aes(x = AttendancePercent, y = MathScore)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm") +
  labs(
    title = "Attendance vs Math Performance",
    x = "Attendance (%)",
    y = "Math Score"
  )
## `geom_smooth()` using formula = 'y ~ x'

Assignment 4: trace() and recover()

In R, trace() and recover() are debugging tools. trace() is used to monitor function execution by adding tracing messages, while recover() is used to explore the call stack after an error to identify the source of the problem.

# Example function: average of the MathScore column.
#   data: a data frame (or list) with a MathScore element
#   returns: the mean of MathScore, with missing values removed
avg_math <- function(data) {
  math_scores <- data$MathScore
  mean(math_scores, na.rm = TRUE)
}
# Attach a tracer to avg_math(): the quoted expression print(data) is run
# on entry to the function, so the input is shown before the result
trace(what = "avg_math", tracer = quote(print(data)))
## [1] "avg_math"
# Call the traced function; the tracer output appears first, then the mean
avg_math(students)
## Tracing avg_math(students) on entry 
##   StudentID    Name Gender Age Department MathScore EnglishScore
## 1       101   Alice      F  20         CS        78           82
## 2       102   Brian      M  21         IT        85           79
## 3       103 Cynthia      F  19         CS        90           88
## 4       104   David      M  22       Math        67           70
## 5       105     Eva      F  20         IT        88           91
## 6       106   Frank      M  23         CS        74           76
## 7       107   Grace      F  21       Math        92           95
##   AttendancePercent
## 1                90
## 2                85
## 3                95
## 4                80
## 5                92
## 6                88
## 7                97
## [1] 82
# Activate recover mode: use recover() as the session's error handler
options(error = utils::recover)

# Function meant to misbehave, to demonstrate recover(): in recover mode I
# inspect the data, check the variables and see where the problem occurred.
# It adds the mean of MathScore to the mean of a column that does not exist.
# NOTE(review): the missing column makes mean() warn and return NA (see the
# output of the call below) instead of raising an error, so this call does
# not appear to enter the error handler - TODO confirm whether a real error
# (e.g. via stop()) is wanted for the demonstration.
test_function <- function(data) {
  existing_mean <- mean(data$MathScore)
  missing_mean <- mean(data$NonExistingColumn)
  existing_mean + missing_mean
}
# Run the demonstration function on the students data
test_function(data = students)
## Warning in mean.default(data$NonExistingColumn): argument is not numeric or
## logical: returning NA
## [1] NA

Assignment 5: 1) Make functions that calculate summary statistics and apply it to a variable to show that it works

##2)Make a function to calculate two sample t test, then apply it to a function

# Summary statistics of a vector.
#   x: the values to summarise (missing values are removed by every statistic)
#   returns: a named list with Mean, Median, Minimum, Maximum and
#            Standard_Deviation, in that order
summary_stats <- function(x) {
  centre <- list(
    Mean = mean(x, na.rm = TRUE),
    Median = median(x, na.rm = TRUE)
  )
  extremes <- list(
    Minimum = min(x, na.rm = TRUE),
    Maximum = max(x, na.rm = TRUE)
  )
  spread <- list(Standard_Deviation = sd(x, na.rm = TRUE))

  # Concatenating the lists keeps the element names and their order
  c(centre, extremes, spread)
}
# Apply the summary function to the MathScore variable to show that it works
summary_stats(x = merged_data$MathScore)
## $Mean
## [1] 82
## 
## $Median
## [1] 85
## 
## $Minimum
## [1] 67
## 
## $Maximum
## [1] 92
## 
## $Standard_Deviation
## [1] 9.255629
# Two-sample t-test of `score` split by `group`.
#   score: the values to compare
#   group: the grouping variable, one entry per score
#   returns: the object produced by t.test() for the formula score ~ group
t_test_function <- function(score, group) {
  # The formula keeps the names `score` and `group`, which t.test() reports
  # in the "data:" line of its printed output
  t.test(score ~ group)
}

# Compare MathScore between the two Gender groups
t_test_function(
  score = merged_data$MathScore,
  group = merged_data$Gender
)
## 
##  Welch Two Sample t-test
## 
## data:  score by group
## t = 1.9151, df = 3.3777, p-value = 0.141
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  -6.549973 29.883307
## sample estimates:
## mean in group F mean in group M 
##        87.00000        75.33333

Assignment 6: The use of sapply(), vapply() and mapply()

# sapply(): compute the mean of the MathScore, EnglishScore and
# AttendancePercent columns of merged_data; the result is a named vector
sapply(
  X = merged_data[, c("MathScore", "EnglishScore", "AttendancePercent")],
  FUN = mean,
  na.rm = TRUE
)
##         MathScore      EnglishScore AttendancePercent 
##          82.00000          83.00000          89.57143
# vapply(): column-wise means of MathScore and EnglishScore in merged_data;
# numeric(1) declares that every column must yield exactly one number
vapply(
  X = merged_data[, c("MathScore", "EnglishScore")],
  FUN = mean,
  FUN.VALUE = numeric(1),
  na.rm = TRUE
)
##    MathScore EnglishScore 
##           82           83
# mapply(): apply a function to several variables in parallel, here adding
# MathScore and EnglishScore for each student
mapply(
  FUN = function(math, english) math + english,
  merged_data$MathScore,
  merged_data$EnglishScore
)
## [1] 160 164 178 137 179 150 187