School Data Analysis Using R Programming Language

This data is from a Bridge Kenya programme. You can find the raw data in this folder: Source Data

Solutions

install R packages for importing the project

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors

library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

import xlsx files into R studio

library(readxl)
Pupil_attendance <- read_excel("C:/Users/hp/Desktop/Excel Format (.xlsx)/Pupil attendance.xlsx")
Pupil_scores <- read_excel("C:/Users/hp/Desktop/Excel Format (.xlsx)/Pupil scores.xlsx")
School_information <- read_excel("C:/Users/hp/Desktop/Excel Format (.xlsx)/School_information.xlsx")
Lesson_completion <- read_excel("C:/Users/hp/Desktop/Excel Format (.xlsx)/Lesson completion.xlsx")

Let’s display a few rows to explore the data set

head(Pupil_attendance)

## # A tibble: 6 × 5
##   school_id pupil_id grade   attendance_records present_records
##       <dbl>    <dbl> <chr>                <dbl>           <dbl>
## 1     35175        1 Grade 1                 91              69
## 2     40580        7 Grade 2                 92              86
## 3      9342        8 Grade 5                 43              39
## 4    858450       10 Grade 5                 86              62
## 5       792       13 Grade 3                104              81
## 6    324884       14 Grade 4                 90              67

head(Pupil_scores)

## # A tibble: 6 × 5
##   school_id pupil_id grade   subject     score
##       <dbl>    <dbl> <chr>   <chr>       <dbl>
## 1     35175        1 Grade 1 Fluency    65    
## 2     35175        1 Grade 1 Kiswahili   0.943
## 3     35175        1 Grade 1 Math        1    
## 4     40580        7 Grade 2 Math        0.933
## 5     40580        7 Grade 2 Kiswahili   0.943
## 6     40580        7 Grade 2 Fluency   117

head(School_information)

## # A tibble: 6 × 4
##   region  province school_id tutoring_program
##   <chr>   <chr>        <dbl> <chr>           
## 1 Mombasa Coast       136992 No              
## 2 Kilifi  Coast       687400 Yes             
## 3 Mombasa Coast       609982 Yes             
## 4 Eastern Eastern     223941 No              
## 5 Isiolo  Eastern      34092 No              
## 6 Isiolo  Eastern      46684 No

head(Lesson_completion)

## # A tibble: 6 × 4
##   school_id teacher_id grade   lesson_completion_rate
##       <dbl>      <dbl> <chr>                    <dbl>
## 1       416        505 Grade 1                  0.568
## 2       416        202 Grade 2                  0.681
## 3       416        124 Grade 3                  0.250
## 4       416        516 Grade 4                  0.359
## 5       416        145 Grade 5                  0.397
## 6       792        545 Grade 1                  0.809

Question 1 Creating a Master Dataset for pupil scores

each variable should form a column from the pupil_scores data

Pupil_scores <- pivot_wider(Pupil_scores, names_from = subject, values_from = score)
head(Pupil_scores)

## # A tibble: 6 × 6
##   school_id pupil_id grade   Fluency Kiswahili  Math
##       <dbl>    <dbl> <chr>     <dbl>     <dbl> <dbl>
## 1     35175        1 Grade 1      65     0.943 1    
## 2     40580        7 Grade 2     117     0.943 0.933
## 3      9342        8 Grade 5     144     0.850 0.700
## 4    858450       10 Grade 5     211     1     0.720
## 5       792       13 Grade 3     221     0.857 0.967
## 6    324884       14 Grade 4     267     0.921 0.900

merge school info and pupil scores on school_id

Pupil_scores <- merge(Pupil_scores, School_information, by="school_id")

merge Pupil_score with Pupil_attendance

Pupil_scores <- inner_join(Pupil_scores, Pupil_attendance, by = c("school_id", "pupil_id", "grade"))

merge Pupil_score with Lesson_completion

Pupil_scores <- inner_join(Pupil_scores, Lesson_completion, by = c("school_id", "grade"))

drop N/A values

Pupil_scores <- drop_na(Pupil_scores)
head(Pupil_scores)

##   school_id pupil_id   grade Fluency  Kiswahili      Math    region province
## 1       416    25509 Grade 2      46 0.65714288 0.4666667 Kirinyaga  Central
## 2       416    19156 Grade 1       2 0.11428571 0.8666667 Kirinyaga  Central
## 3       416     6768 Grade 1      17 0.08571429 0.7000000 Kirinyaga  Central
## 4       416    21072 Grade 4     101 0.92000002 0.8800000 Kirinyaga  Central
## 5       416    14895 Grade 1      35 0.65714288 0.9333333 Kirinyaga  Central
## 6       416    26676 Grade 4     102 0.81999999 0.8200000 Kirinyaga  Central
##   tutoring_program attendance_records present_records teacher_id
## 1               No                 87              57        202
## 2               No                 38              31        505
## 3               No                 85              69        505
## 4               No                 99              91        516
## 5               No                 85              56        505
## 6               No                 99              83        516
##   lesson_completion_rate
## 1              0.6806084
## 2              0.5684008
## 3              0.5684008
## 4              0.3591549
## 5              0.5684008
## 6              0.3591549

save new Pupil_scores as main dataset in new csv file

write.csv(Pupil_scores, file= "Pupil_scores.csv")

Question 2 Percentage of pupils present in class for “all pupils”

Avg_percent_pupils_present <- round(mean(Pupil_scores$present_records) /
                              mean(Pupil_scores$attendance_records), 2)
percent(Avg_percent_pupils_present)

## [1] "77%"

This shows an average of 77% of the students were present in class for all pupils

Percentage of pupils present across “each” schools let’s know how many schools we have

n_distinct(Pupil_scores$school_id)

## [1] 111

this shows that we have 111 distinct schools

create a dataframe grouping by these 111 schools i.e school_id

Schools <- Pupil_attendance %>%
              group_by(school_id) %>%
              summarise (attendance_record = sum(attendance_records),
                         present_record = sum(present_records))
head(Schools)

## # A tibble: 6 × 3
##   school_id attendance_record present_record
##       <dbl>             <dbl>          <dbl>
## 1       416              6840           5198
## 2       792              9491           7146
## 3      5502              7873           5255
## 4      7956             10750           8249
## 5      8084             15679          10710
## 6      9342              7178           5489

In percentage

percent(round(mean(Schools$present_record) /
                mean(Schools$attendance_record), 2))

## [1] "76%"

This shows an average of 76% of the students were present in class across each schools Not much difference in percentages

Question 3 Average Fluency Scores of each Grades in a chart

creating a dataframe “grades_fluency” for grades and fluency score

grades_fluency <- Pupil_scores %>% select(grade, Fluency)
head(grades_fluency)

##     grade Fluency
## 1 Grade 2      46
## 2 Grade 1       2
## 3 Grade 1      17
## 4 Grade 4     101
## 5 Grade 1      35
## 6 Grade 4     102

average fleuncy scores by grade

grades_fluency_avg <- grades_fluency %>%
                      group_by(grade)%>%
                      summarise(avg_fluency_score = round(mean(Fluency),2))
head(grades_fluency_avg)

## # A tibble: 5 × 2
##   grade   avg_fluency_score
##   <chr>               <dbl>
## 1 Grade 1              53.2
## 2 Grade 2             105. 
## 3 Grade 3             128. 
## 4 Grade 4             146. 
## 5 Grade 5             155.

bar chart showing average fluency score

ggplot(grades_fluency_avg, aes(avg_fluency_score, grade)) + 
  geom_bar(stat= "identity", fill = "lightblue") + 
  labs(title="Average Fluency Scores By Grades", y="grades", x="average fluency score") +
  geom_text(aes(label= avg_fluency_score), hjust=1.2, colour="white")

Question 4 Region having highest & lowest fluency score across all grades select the region and fluency column from the pupil_scores to a new dataframe

region_fluency <- Pupil_scores %>% select(region, Fluency)
head(region_fluency)

##      region Fluency
## 1 Kirinyaga      46
## 2 Kirinyaga       2
## 3 Kirinyaga      17
## 4 Kirinyaga     101
## 5 Kirinyaga      35
## 6 Kirinyaga     102

group by region and average fleuncy score

region_fluency_avg <- region_fluency %>%
  group_by(region)%>%
  summarise(avg_fluency_score = round(mean(Fluency), 2)) %>%
  arrange(-avg_fluency_score)

Top 5 regions with highest fluency scores

region_highest <- head (region_fluency_avg, 3)
head(region_highest)

## # A tibble: 3 × 2
##   region        avg_fluency_score
##   <chr>                     <dbl>
## 1 Machakos                   159.
## 2 Siaya                      145.
## 3 Tharaka nithi              145.

the result shows that region Machakos had the highest average fluency score of 159.15 across all grades with Siaya and Tharaka nithi coming second an third with scores of 145.02 and 144.58 respectively

Region having lowest fluency scores

region_lowest <- tail (region_fluency_avg, 3)
head(region_lowest)

## # A tibble: 3 × 2
##   region    avg_fluency_score
##   <chr>                 <dbl>
## 1 Nairobi                87.7
## 2 Bomet                  68.4
## 3 Kirinyaga              60.3

Regions with lowest fluency scores include Nairobi, Bomet and Kirinyaga with Kirinyaga having the lowest fluency score of 60.31

Question 5 i)

creating dataframe for grade, fluency and school_id

df<- Pupil_scores %>%
  select(grade,  Fluency, school_id)
head(df)

##     grade Fluency school_id
## 1 Grade 2      46       416
## 2 Grade 1       2       416
## 3 Grade 1      17       416
## 4 Grade 4     101       416
## 5 Grade 1      35       416
## 6 Grade 4     102       416

add a column based on if fluency is less than 10 return 1, else 0

df <- mutate(df,fluency_range = if_else(Fluency <= 10, 1, 0))
head(df)

##     grade Fluency school_id fluency_range
## 1 Grade 2      46       416             0
## 2 Grade 1       2       416             1
## 3 Grade 1      17       416             0
## 4 Grade 4     101       416             0
## 5 Grade 1      35       416             0
## 6 Grade 4     102       416             0

dg<- df %>%
  group_by(grade) %>%
  summarise(sum_fluency_range = sum(fluency_range == 1))
dg

## # A tibble: 5 × 2
##   grade   sum_fluency_range
##   <chr>               <int>
## 1 Grade 1               286
## 2 Grade 2                88
## 3 Grade 3                52
## 4 Grade 4                38
## 5 Grade 5                18

chart to show share of pupils under this threshold

ggplot(dg, aes(grade, sum_fluency_range)) + 
  geom_bar(stat= "identity", fill = "lightblue") + 
  labs(title="Grades by Fluency score of 10 and Lower ", x="grades", y="count") +
  geom_text(aes(label= sum_fluency_range), vjust=1.2, colour="white")

school having the highest share of pupils scoring under this threshold (<=10) in grade 3

dt <- df %>%
  group_by(grade, school_id) %>%
  summarise(sum_fluency_range = sum(fluency_range == 1))%>%
  filter(grade == "Grade 3") %>%
  arrange(-sum_fluency_range)

## `summarise()` has grouped output by 'grade'. You can override using the
## `.groups` argument.

head (dt, 3)

## # A tibble: 3 × 3
## # Groups:   grade [1]
##   grade   school_id sum_fluency_range
##   <chr>       <dbl>             <int>
## 1 Grade 3    223941                12
## 2 Grade 3    206948                 5
## 3 Grade 3     46528                 4

this shows that school_id “223941” had the highest share of pupil scoring <= 10 in grade 3

Question 6 Spread of lesson completion rate by teachers and teachers to be rewarded

visualization on a histogram chart

ggplot(Pupil_scores, aes(x = lesson_completion_rate)) +
  geom_histogram(fill="lightblue") +
  labs(title="Chart showing spread of Lesson Completion Rate")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

this histogram chart is rightly skewed and shows that most lesson completion rates by teachers ranges from 0.65 to 0.8. Generally lesson completion rate were higher compared to lower rates of 0.00 to 0.25

teachers to be rewarded

teacher_completion <- Pupil_scores %>%
  group_by(teacher_id, school_id) %>%
  summarise(lesson_completion_rates = round(sum(lesson_completion_rate), 2)) %>%
  arrange(-lesson_completion_rates)

## `summarise()` has grouped output by 'teacher_id'. You can override using the
## `.groups` argument.

head(teacher_completion, 3)

## # A tibble: 3 × 3
## # Groups:   teacher_id [3]
##   teacher_id school_id lesson_completion_rates
##        <dbl>     <dbl>                   <dbl>
## 1        194     10904                    38.8
## 2        126    239870                    38.6
## 3         65    687400                    38.6

this shows that teacher id 194, 126, and 65 had the highest sum of lesson completion rate respectively they stand the chance to be rewarded with their corresponding schools.

Question 7 Region with Highest and Lowest number of pupils and schools

region_data <- Pupil_scores %>%
  group_by(region) %>%
  summarise(no.of_pupils = n_distinct(pupil_id),
            no.of_schools = n_distinct(school_id)) %>%
  arrange(-no.of_pupils)
head(region_data, 3)

## # A tibble: 3 × 3
##   region  no.of_pupils no.of_schools
##   <chr>          <int>         <int>
## 1 Mombasa         1641            15
## 2 Nairobi         1401            11
## 3 Kwale           1031             9

The above result shows the top 3 regions with highest number of pupils and number of schools. Mombasa having the highest number of pupils across the kenya dataset with a total of 1641 pupils and 15 number of schools

region_data <- Pupil_scores %>%
  group_by(region) %>%
  summarise(no.of_pupils = n_distinct(pupil_id),
            no.of_schools = n_distinct(school_id)) %>%
  arrange(-no.of_pupils)
tail(region_data, 3)

## # A tibble: 3 × 3
##   region      no.of_pupils no.of_schools
##   <chr>              <int>         <int>
## 1 Kakamega              60             1
## 2 Machakos              54             1
## 3 Trans nzoia           46             1

This result shows regions with lowest number of pupils and schools across all regions. They all had 1 school each and Trans nzoia region having by far the lowest number of 46 pupils to a school.