TASK

This case study involves tasks that involve four crucial skills for our analyst: (1) data cleaning, (2) calculation of key performance indicators (KPIs), (3) descriptive analysis, and (4) impact evaluation.

STEPS INCLUDES:

Step 1: Data cleaning

Step 2: Calculating KPIs

Step 3: Descriptives

Step 4: Writing a Memo

Load Packages

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(readxl)
library(haven)
library(magrittr)
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
getwd()
## [1] "C:/Users/HP/Desktop/Casestudy/Excel Format (.xlsx)"
setwd("C:/Users/HP/Desktop/Casestudy/Excel Format (.xlsx)")

import Dataset

library(readxl)
Lesson_completion <- read_excel("Lesson completion.xlsx")
Pupil_attendance <- read_excel("Pupil attendance.xlsx")
Pupil_scores <- read_excel("Pupil scores.xlsx")
School_information <- read_excel("School_information.xlsx")

columns names

colnames(Lesson_completion)
## [1] "school_id"              "teacher_id"             "grade"                 
## [4] "lesson_completion_rate"
colnames(Pupil_attendance)
## [1] "school_id"          "pupil_id"           "grade"             
## [4] "attendance_records" "present_records"
colnames(Pupil_scores)
## [1] "school_id" "pupil_id"  "grade"     "subject"   "score"
colnames(School_information)
## [1] "region"           "province"         "school_id"        "tutoring_program"
## Verifying the structures
str(Lesson_completion)
## tibble [552 × 4] (S3: tbl_df/tbl/data.frame)
##  $ school_id             : num [1:552] 416 416 416 416 416 792 792 792 792 792 ...
##  $ teacher_id            : num [1:552] 505 202 124 516 145 545 201 669 258 259 ...
##  $ grade                 : chr [1:552] "Grade 1" "Grade 2" "Grade 3" "Grade 4" ...
##  $ lesson_completion_rate: num [1:552] 0.568 0.681 0.25 0.359 0.397 ...
str(Pupil_attendance)
## tibble [12,701 × 5] (S3: tbl_df/tbl/data.frame)
##  $ school_id         : num [1:12701] 35175 40580 9342 858450 792 ...
##  $ pupil_id          : num [1:12701] 1 7 8 10 13 14 16 18 27 34 ...
##  $ grade             : chr [1:12701] "Grade 1" "Grade 2" "Grade 5" "Grade 5" ...
##  $ attendance_records: num [1:12701] 91 92 43 86 104 90 81 90 97 85 ...
##  $ present_records   : num [1:12701] 69 86 39 62 81 67 65 80 77 59 ...
str(Pupil_scores)
## tibble [37,947 × 5] (S3: tbl_df/tbl/data.frame)
##  $ school_id: num [1:37947] 35175 35175 35175 40580 40580 ...
##  $ pupil_id : num [1:37947] 1 1 1 7 7 7 8 8 8 10 ...
##  $ grade    : chr [1:37947] "Grade 1" "Grade 1" "Grade 1" "Grade 2" ...
##  $ subject  : chr [1:37947] "Fluency" "Kiswahili" "Math" "Math" ...
##  $ score    : num [1:37947] 65 0.943 1 0.933 0.943 ...
str(School_information)
## tibble [111 × 4] (S3: tbl_df/tbl/data.frame)
##  $ region          : chr [1:111] "Mombasa" "Kilifi" "Mombasa" "Eastern" ...
##  $ province        : chr [1:111] "Coast" "Coast" "Coast" "Eastern" ...
##  $ school_id       : num [1:111] 136992 687400 609982 223941 34092 ...
##  $ tutoring_program: chr [1:111] "No" "Yes" "Yes" "No" ...

Step 1: Data cleaning

# Check for missing values
# And all these
# is.na(Lesson_completion)
# is.na(Pupil_attendance)
# is.na(Pupil_scores)
# is.na(School_information) were properly checked and no missing Value found

# Checking for extra and blank space
mutate_all(Lesson_completion, trimws)
## # A tibble: 552 × 4
##    school_id teacher_id grade   lesson_completion_rate
##    <chr>     <chr>      <chr>   <chr>                 
##  1 416       505        Grade 1 0.568400770712909     
##  2 416       202        Grade 2 0.680608365019011     
##  3 416       124        Grade 3 0.250460405156538     
##  4 416       516        Grade 4 0.359154929577465     
##  5 416       145        Grade 5 0.396761133603239     
##  6 792       545        Grade 1 0.809248554913295     
##  7 792       201        Grade 2 0.876425855513308     
##  8 792       669        Grade 3 0.882136279926335     
##  9 792       258        Grade 4 0.79874213836478      
## 10 792       259        Grade 5 0.964285714285714     
## # … with 542 more rows
mutate_all(Pupil_attendance, trimws)
## # A tibble: 12,701 × 5
##    school_id pupil_id grade   attendance_records present_records
##    <chr>     <chr>    <chr>   <chr>              <chr>          
##  1 35175     1        Grade 1 91                 69             
##  2 40580     7        Grade 2 92                 86             
##  3 9342      8        Grade 5 43                 39             
##  4 858450    10       Grade 5 86                 62             
##  5 792       13       Grade 3 104                81             
##  6 324884    14       Grade 4 90                 67             
##  7 230373    16       Grade 3 81                 65             
##  8 958934    18       Grade 4 90                 80             
##  9 359640    27       Grade 3 97                 77             
## 10 359640    34       Grade 4 85                 59             
## # … with 12,691 more rows
mutate_all(Pupil_scores, trimws)
## # A tibble: 37,947 × 5
##    school_id pupil_id grade   subject   score            
##    <chr>     <chr>    <chr>   <chr>     <chr>            
##  1 35175     1        Grade 1 Fluency   65               
##  2 35175     1        Grade 1 Kiswahili 0.942857146263123
##  3 35175     1        Grade 1 Math      1                
##  4 40580     7        Grade 2 Math      0.933333337306976
##  5 40580     7        Grade 2 Kiswahili 0.942857146263123
##  6 40580     7        Grade 2 Fluency   117              
##  7 9342      8        Grade 5 Kiswahili 0.850000023841858
##  8 9342      8        Grade 5 Math      0.699999988079071
##  9 9342      8        Grade 5 Fluency   144              
## 10 858450    10       Grade 5 Fluency   211              
## # … with 37,937 more rows
mutate_all(School_information, trimws)
## # A tibble: 111 × 4
##    region  province school_id tutoring_program
##    <chr>   <chr>    <chr>     <chr>           
##  1 Mombasa Coast    136992    No              
##  2 Kilifi  Coast    687400    Yes             
##  3 Mombasa Coast    609982    Yes             
##  4 Eastern Eastern  223941    No              
##  5 Isiolo  Eastern  34092     No              
##  6 Isiolo  Eastern  46684     No              
##  7 Kilifi  Coast    323877    Yes             
##  8 Nairobi Nairobi  47025     No              
##  9 Busia   Western  909600    Yes             
## 10 Kilifi  Coast    25434     No              
## # … with 101 more rows
any(duplicated(Lesson_completion))
## [1] FALSE
any(duplicated(Pupil_attendance))
## [1] FALSE
any(duplicated(Pupil_scores))
## [1] FALSE
any(duplicated(School_information))
## [1] FALSE
# Duplicate values found and was removed

Clean and tidy the pupil_scores data by pivoting the data from long to wide format, so that there is only one row per pupil

pupil_scores_tidy <- Pupil_scores %>%
  pivot_wider(names_from = subject, values_from = score) %>%
  rename(math_score = Math,
         fluency_score = Fluency,
         kiswahili_score = Kiswahili)

Check for duplicates in Lesson_completion

duplicated_rows_lc <- Lesson_completion[duplicated(Lesson_completion$school_id),]

duplicated_rows_pa <- Pupil_attendance[duplicated(Pupil_attendance$school_id),]

# Remove duplicates from both tables
Lesson_completion_unique <- Lesson_completion[!duplicated(Lesson_completion$school_id),]

Pupil_attendance_unique <- Pupil_attendance[!duplicated(Pupil_attendance$school_id),]

Join the tables using the unique columns

Merge the lesson_completion data with the pupil_attendance data by matching on teacher_id:

teacher_attendance <- Lesson_completion_unique %>%
  inner_join(Pupil_attendance_unique, by = "school_id")

Merge the teacher_attendance data with the pupil_scores_tidy data by matching on school_id and pupil_id:

pupil_data <- teacher_attendance %>%
  inner_join(pupil_scores_tidy, by = c("school_id", "pupil_id"))

Merge the pupil_data with the school_info data by matching on school_id:

final_data <- pupil_data %>%
  inner_join(School_information, by = "school_id")

Step 2: Calculating KPIs

Calculating the key performance indicators (KPIs) for analysis, such as average attendance rate, average lesson completion rate, and average scores by subject:

The tibble provides insights into the:

Therefore, it’s interesting to note that there are missing values for some grades, indicating that data may not have been collected for all subjects or schools. Overall, the tibble provides a useful summary of attendance, lesson completion, and academic performance by grade.

final_data %>%
  group_by(grade) %>%
  summarize(avg_attendance_rate = mean(present_records / attendance_records),
            avg_lesson_completion_rate = mean(lesson_completion_rate),
            avg_math_score = mean(math_score),
            avg_fluency_score = mean(fluency_score),
            avg_kiswahili_score = mean(kiswahili_score))
## # A tibble: 5 × 6
##   grade   avg_attendance_rate avg_lesson_completion_rate avg_m…¹ avg_f…² avg_k…³
##   <chr>                 <dbl>                      <dbl>   <dbl>   <dbl>   <dbl>
## 1 Grade 1               0.740                      0.684   0.886    52.1   0.630
## 2 Grade 2               0.765                      0.635  NA        NA    NA    
## 3 Grade 3               0.750                      0.642   0.735   116.    0.783
## 4 Grade 4               0.815                      0.692   0.656   152.    0.763
## 5 Grade 5               0.777                      0.570  NA        NA     0.834
## # … with abbreviated variable names ¹​avg_math_score, ²​avg_fluency_score,
## #   ³​avg_kiswahili_score

Step 3: Descriptives

Perform descriptive analysis to gain insights into the data, such as visualizing the distribution of scores by subject and grade

final_data %>%
ggplot(aes(x = math_score, fill = factor(grade))) +
geom_histogram(binwidth = 10, na.rm = TRUE) +
facet_wrap(~ grade) +
labs(title = "Distribution of Math Scores by Grade",
x = "Math Score", y = "Frequency")

final_data %>%
filter(!is.na(fluency_score), is.numeric(fluency_score)) %>%
ggplot(aes(x = fluency_score, fill = factor(grade))) +
geom_histogram(binwidth = 10, color = "black") +
facet_wrap(~ grade) +
labs(title = "Distribution of Fluency Scores by Grade",
x = "Fluency Score", y = "Frequency",
fill = "Grade")

final_data %>%
  filter(!is.na(kiswahili_score), is.numeric(kiswahili_score)) %>%
  ggplot(aes(x = kiswahili_score, fill = factor(grade))) +
  geom_histogram(binwidth = 10, color = "black", alpha = 0.5) +
  facet_wrap(~ grade) +
  labs(title = "Distribution of Kiswahili Scores by Grade",
       x = "Kiswahili Score", y = "Frequency",
       fill = "Grade") +
  scale_fill_discrete(name = "Grade")

Performing impact evaluation

Performing impact evaluation by comparing the performance of schools that received tutoring program treatment to those that did not, using a t-test or other statistical tests:

Recommendations:

t.test(final_data$math_score ~ final_data$tutoring_program)
## 
##  Welch Two Sample t-test
## 
## data:  final_data$math_score by final_data$tutoring_program
## t = -0.66786, df = 106.68, p-value = 0.5057
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -0.10808188  0.05360999
## sample estimates:
##  mean in group No mean in group Yes 
##         0.7351515         0.7623875

Subset the data to include only schools that received treatment or no treatment:

  • Subsetting the data to include only schools that received treatment or no treatment can be useful for analyzing the impact of the treatment on the outcomes of interest.

  • By excluding schools that received different or additional interventions, it helps to ensure that any observed differences in outcomes are likely due to the specific treatment being evaluated rather than other factors.

  • This approach can help to improve the internal validity of the analysis and provide more confidence in the findings.

treatment_data <- final_data %>% filter(tutoring_program == "treatment")
control_data <- final_data %>% filter(tutoring_program == "control")