# Import the student CSV file and store it in the data frame `students`.
# NOTE: the path is absolute and machine-specific; adjust it when running
# this script on another computer.
students <- read.csv("C:/Users/eniyomufasha/Downloads/student_data.csv")
# View() opens a spreadsheet-style viewer, which is only useful in an
# interactive session, so it is skipped when the script runs
# non-interactively (for example while knitting).
if (interactive()) {
  View(students)
}
# Compact overview of the column names, types and first values.
str(students)
## 'data.frame': 7 obs. of 8 variables:
## $ StudentID : int 101 102 103 104 105 106 107
## $ Name : chr "Alice" "Brian" "Cynthia" "David" ...
## $ Gender : chr "F" "M" "F" "M" ...
## $ Age : int 20 21 19 22 20 23 21
## $ Department : chr "CS" "IT" "CS" "Math" ...
## $ MathScore : int 78 85 90 67 88 74 92
## $ EnglishScore : int 82 79 88 70 91 76 95
## $ AttendancePercent: int 90 85 95 80 92 88 97
# Load an SPSS (.sav) file into RStudio
file.exists("sales.sav") # Check that the file exists before reading it
## [1] TRUE
# install.packages("haven") # Commented out because package installation should be done outside the R Markdown file.
library(haven)
data <- read_sav("sales.sav")
# View() opens a spreadsheet-style viewer, which is only useful in an
# interactive session, so it is skipped when the script runs
# non-interactively (for example while knitting).
if (interactive()) {
  View(data)
}
# Compact overview of the imported columns.
str(data)
## tibble [10 × 6] (S3: tbl_df/tbl/data.frame)
## $ TransactionID: num [1:10] 1 2 3 4 5 6 7 8 9 10
## ..- attr(*, "format.spss")= chr "F8.2"
## $ Product : chr [1:10] "Laptop" "Office Chair" "Smartphone" "Desk" ...
## ..- attr(*, "format.spss")= chr "A12"
## $ Category : chr [1:10] "Electronics" "Furniture" "Electronics" "Furniture" ...
## ..- attr(*, "format.spss")= chr "A11"
## $ Quantity : num [1:10] 2 5 3 2 10 1 20 50 4 1
## ..- attr(*, "format.spss")= chr "F8.2"
## $ UnitPrice : num [1:10] 750 120 500 200 30 150 2 1 180 300
## ..- attr(*, "format.spss")= chr "F8.2"
## $ Date : chr [1:10] "2026-01-10" "2026-01-11" "2026-01-12" "2026-01-13" ...
## ..- attr(*, "format.spss")= chr "A10"
# Importing the second data set, called discipline, so that I can merge it with students
library(readxl)
# Read the discipline workbook (absolute, machine-specific path) and
# inspect the structure of the imported table.
discipline <- read_excel(
  path = "C:/Users/eniyomufasha/Downloads/discipline.xlsx"
)
str(object = discipline)
## tibble [7 × 7] (S3: tbl_df/tbl/data.frame)
## $ StudentID : num [1:7] 101 102 103 104 105 106 107
## $ Name : chr [1:7] "Alice" "Brian" "Cynthia" "David" ...
## $ Department : chr [1:7] "CS" "IT" "CS" "Math" ...
## $ DisciplineCases: num [1:7] 2 0 5 1 3 4 1
## $ LateCount : num [1:7] 3 1 6 2 4 5 2
## $ BehaviorScore : num [1:7] 75 90 55 80 65 60 78
## $ WarningLevel : chr [1:7] "Medium" "Low" "High" "Low" ...
# Merge the two data sets on the shared key columns StudentID and Department.
# Name exists in both inputs but is not a key, so it appears in the result
# as Name.x and Name.y.
merged_data <- merge(
  x = students,
  y = discipline,
  by = c("StudentID", "Department")
)
nrow(students) # Number of rows in the students data set
## [1] 7
nrow(discipline) # Number of rows in the discipline data set
## [1] 7
nrow(merged_data) # Number of rows in the merged data set
## [1] 7
str(merged_data)
## 'data.frame': 7 obs. of 13 variables:
## $ StudentID : int 101 102 103 104 105 106 107
## $ Department : chr "CS" "IT" "CS" "Math" ...
## $ Name.x : chr "Alice" "Brian" "Cynthia" "David" ...
## $ Gender : chr "F" "M" "F" "M" ...
## $ Age : int 20 21 19 22 20 23 21
## $ MathScore : int 78 85 90 67 88 74 92
## $ EnglishScore : int 82 79 88 70 91 76 95
## $ AttendancePercent: int 90 85 95 80 92 88 97
## $ Name.y : chr "Alice" "Brian" "Cynthia" "David" ...
## $ DisciplineCases : num 2 0 5 1 3 4 1
## $ LateCount : num 3 1 6 2 4 5 2
## $ BehaviorScore : num 75 90 55 80 65 60 78
## $ WarningLevel : chr "Medium" "Low" "High" "Low" ...
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# group_by() with %>%: summarise the merged data per Department, giving the
# number of students and the mean Math, English, attendance and behavior
# scores for each department.
results <- merged_data %>%
  group_by(Department) %>%
  summarise(
    total_students = n(),
    average_math = mean(MathScore, na.rm = TRUE),
    average_english = mean(EnglishScore, na.rm = TRUE),
    average_attendance = mean(AttendancePercent, na.rm = TRUE),
    average_behavior = mean(BehaviorScore, na.rm = TRUE)
  )
nrow(results) # One row per department, so this is the number of departments
## [1] 3
# View() opens a spreadsheet-style viewer, which is only useful in an
# interactive session, so it is skipped when the script runs
# non-interactively (for example while knitting).
if (interactive()) {
  View(results)
}
# select(): keep only the identifying columns and the two score columns
students1 <- select(
  students,
  StudentID, Name, Gender, Department, MathScore, EnglishScore
)
print(students1)
## StudentID Name Gender Department MathScore EnglishScore
## 1 101 Alice F CS 78 82
## 2 102 Brian M IT 85 79
## 3 103 Cynthia F CS 90 88
## 4 104 David M Math 67 70
## 5 105 Eva F IT 88 91
## 6 106 Frank M CS 74 76
## 7 107 Grace F Math 92 95
# filter(): keep only the students who scored above 75 in English
students2 <- students1 %>%
  filter(EnglishScore > 75)
# Print the filtered result (students2), not the unfiltered input.
print(students2)
## StudentID Name Gender Department MathScore EnglishScore
## 1 101 Alice F CS 78 82
## 2 102 Brian M IT 85 79
## 3 103 Cynthia F CS 90 88
## 4 105 Eva F IT 88 91
## 5 106 Frank M CS 74 76
## 6 107 Grace F Math 92 95
# arrange(): order the filtered students from highest to lowest EnglishScore
students3 <- arrange(students2, desc(EnglishScore))
print(students3)
## StudentID Name Gender Department MathScore EnglishScore
## 1 107 Grace F Math 92 95
## 2 105 Eva F IT 88 91
## 3 103 Cynthia F CS 90 88
## 4 101 Alice F CS 78 82
## 5 102 Brian M IT 85 79
## 6 106 Frank M CS 74 76
# rename(): give StudentID and EnglishScore shorter names (new = old)
students4 <- rename(students3, St_ID = StudentID, English = EnglishScore)
print(students4)
## St_ID Name Gender Department MathScore English
## 1 107 Grace F Math 92 95
## 2 105 Eva F IT 88 91
## 3 103 Cynthia F CS 90 88
## 4 101 Alice F CS 78 82
## 5 102 Brian M IT 85 79
## 6 106 Frank M CS 74 76
# mutate(): add two new variables, TotalScore (Math + English) and
# AverageScore (TotalScore divided by 2)
students_final <- mutate(
  students4,
  TotalScore = MathScore + English,
  AverageScore = TotalScore / 2
)
print(students_final)
## St_ID Name Gender Department MathScore English TotalScore AverageScore
## 1 107 Grace F Math 92 95 187 93.5
## 2 105 Eva F IT 88 91 179 89.5
## 3 103 Cynthia F CS 90 88 178 89.0
## 4 101 Alice F CS 78 82 160 80.0
## 5 102 Brian M IT 85 79 164 82.0
## 6 106 Frank M CS 74 76 150 75.0
library(ggplot2)
# Math score by gender (box plot), with one fill colour per gender
ggplot(
  data = students,
  mapping = aes(x = Gender, y = MathScore, fill = Gender)
) +
  geom_boxplot() +
  labs(
    title = "Math Scores by Gender",
    x = "Gender",
    y = "Math Score"
  ) +
  theme_minimal()
# Attendance vs Math performance (scatter plot) with a fitted linear trend
ggplot(
  data = students,
  mapping = aes(x = AttendancePercent, y = MathScore)
) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm") +
  labs(
    title = "Attendance vs Math Performance",
    x = "Attendance (%)",
    y = "Math Score"
  )
## `geom_smooth()` using formula = 'y ~ x'
# In R, trace() and recover() are debugging tools. trace() monitors function
# execution by inserting tracing code that runs when the function is called,
# while recover() lets you browse the call stack after an error to identify
# the source of the problem.
# Example function: average of the MathScore column of `data`,
# ignoring missing values.
avg_math <- function(data) {
  math_scores <- data$MathScore
  mean(math_scores, na.rm = TRUE)
}
# Trace avg_math(): the tracer expression prints the `data` argument each
# time the function is entered.
trace(what = "avg_math", tracer = quote(print(data)))
## [1] "avg_math"
avg_math(data = students)
## Tracing avg_math(students) on entry
## StudentID Name Gender Age Department MathScore EnglishScore
## 1 101 Alice F 20 CS 78 82
## 2 102 Brian M 21 IT 85 79
## 3 103 Cynthia F 19 CS 90 88
## 4 104 David M 22 Math 67 70
## 5 105 Eva F 20 IT 88 91
## 6 106 Frank M 23 CS 74 76
## 7 107 Grace F 21 Math 92 95
## AttendancePercent
## 1 90
## 2 85
## 3 95
## 4 80
## 5 92
## 6 88
## 7 97
## [1] 82
# Activate recover mode. options() returns the previous settings, which are
# kept so the original error handler can be restored after the demonstration.
old_options <- options(error = recover)
# Demonstration function for recover mode: in recover mode I can inspect
# `data`, check variables and see where an error occurred.
# Note: `data$NonExistingColumn` is NULL, so mean() only gives a warning and
# NA here (see the output below); recover is entered only when an error is
# actually raised.
test_function <- function(data) {
  mean(data$MathScore) + mean(data$NonExistingColumn)
}
test_function(students)
## Warning in mean.default(data$NonExistingColumn): argument is not numeric or
## logical: returning NA
## [1] NA
# Restore the previous error handler so later errors in this session do not
# drop into recover mode.
options(old_options)
## 2) Make functions to calculate summary statistics and a two-sample t-test, then apply them to the merged data
# Descriptive statistics of a numeric vector `x`, ignoring missing values.
# Returns a named list with Mean, Median, Minimum, Maximum and
# Standard_Deviation.
summary_stats <- function(x) {
  list(
    Mean = mean(x, na.rm = TRUE),
    Median = median(x, na.rm = TRUE),
    Minimum = min(x, na.rm = TRUE),
    Maximum = max(x, na.rm = TRUE),
    Standard_Deviation = sd(x, na.rm = TRUE)
  )
}
# Apply summary_stats() to the MathScore column of the merged data
summary_stats(x = merged_data$MathScore)
## $Mean
## [1] 82
##
## $Median
## [1] 85
##
## $Minimum
## [1] 67
##
## $Maximum
## [1] 92
##
## $Standard_Deviation
## [1] 9.255629
# Two-sample t-test of `score` split by `group`, using t.test()'s formula
# interface. Returns the test object produced by t.test().
t_test_function <- function(score, group) {
  t.test(score ~ group)
}
# Compare MathScore between the two Gender groups of the merged data
t_test_function(
  score = merged_data$MathScore,
  group = merged_data$Gender
)
##
## Welch Two Sample t-test
##
## data: score by group
## t = 1.9151, df = 3.3777, p-value = 0.141
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -6.549973 29.883307
## sample estimates:
## mean in group F mean in group M
## 87.00000 75.33333
# sapply(): compute the mean of the MathScore, EnglishScore and
# AttendancePercent columns of merged_data; the result is a named vector.
sapply(
  X = merged_data[c("MathScore", "EnglishScore", "AttendancePercent")],
  FUN = mean,
  na.rm = TRUE
)
## MathScore EnglishScore AttendancePercent
## 82.00000 83.00000 89.57143
# vapply(): column-wise means of MathScore and EnglishScore in merged_data,
# with each result declared to be a single numeric value.
vapply(
  X = merged_data[c("MathScore", "EnglishScore")],
  FUN = mean,
  FUN.VALUE = numeric(1),
  na.rm = TRUE
)
## MathScore EnglishScore
## 82 83
## mapply(): apply a function to several variables at once, here adding
## MathScore and EnglishScore for each student
mapply(
  FUN = function(math, english) math + english,
  merged_data$MathScore,
  merged_data$EnglishScore
)
## [1] 160 164 178 137 179 150 187