# Load the students' performance data; the first line of the CSV holds the
# column names.
Performance <- read.csv(
  file = "C:/Users/User/Downloads/Textbooks/StudentsPerformance.csv",
  header = TRUE
)

###To get the first five rows

# Print the first five observations.
head(Performance, n = 5)
##   students_id gender race_ethnicity parental_education_status        lunch
## 1           1 female        group B         bachelor's degree     standard
## 2           2 female        group C              some college     standard
## 3           3 female        group B           master's degree     standard
## 4           4   male        group A        associate's degree free/reduced
## 5           5   male        group C              some college     standard
##   test_preparation_course math_score reading_score writing_score
## 1                    none         72            72            74
## 2               completed         69            90            88
## 3                    none         90            95            93
## 4                    none         47            57            44
## 5                    none         76            78            75

#To get the structure

# Compact overview of the data frame: one line per column showing its type
# and first few values.
str(Performance)
## 'data.frame':    1000 obs. of  9 variables:
##  $ students_id              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ gender                   : chr  "female" "female" "female" "male" ...
##  $ race_ethnicity           : chr  "group B" "group C" "group B" "group A" ...
##  $ parental_education_status: chr  "bachelor's degree" "some college" "master's degree" "associate's degree" ...
##  $ lunch                    : chr  "standard" "standard" "standard" "free/reduced" ...
##  $ test_preparation_course  : chr  "none" "completed" "none" "none" ...
##  $ math_score               : int  72 69 90 47 76 71 88 40 64 38 ...
##  $ reading_score            : int  72 90 95 57 78 83 95 43 64 60 ...
##  $ writing_score            : int  74 88 93 44 75 78 92 39 67 50 ...

#To get the dimension

# dim() reports both counts at once: number of rows first, then columns.
dim(Performance)
## [1] 1000    9

#To get the number of columns and rows

# Number of columns (variables) in the data frame.
ncol(Performance)
## [1] 9
# Number of rows (students) in the data frame.
nrow(Performance)
## [1] 1000

#To get the column names

# For a data frame, names() returns the column names.
names(Performance)
## [1] "students_id"               "gender"                   
## [3] "race_ethnicity"            "parental_education_status"
## [5] "lunch"                     "test_preparation_course"  
## [7] "math_score"                "reading_score"            
## [9] "writing_score"

#To select a single variable (column)

library('tidyverse')
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Keep only the lunch column and preview the first ten rows.
Rst1 <- select(Performance, lunch)
head(Rst1, n = 10)
##           lunch
## 1      standard
## 2      standard
## 3      standard
## 4  free/reduced
## 5      standard
## 6      standard
## 7      standard
## 8  free/reduced
## 9  free/reduced
## 10 free/reduced

#To select more than one column

# Keep the math and writing scores, in that order, and preview ten rows.
Rst2 <- Performance %>%
  select(math_score, writing_score)
head(Rst2, n = 10)
##    math_score writing_score
## 1          72            74
## 2          69            88
## 3          90            93
## 4          47            44
## 5          76            75
## 6          71            78
## 7          88            92
## 8          40            39
## 9          64            67
## 10         38            50

#To exclude a column

# Drop the lunch column, keeping every other column in its original order.
Rst3 <- select(Performance, -lunch)
head(Rst3, n = 10)
##    students_id gender race_ethnicity parental_education_status
## 1            1 female        group B         bachelor's degree
## 2            2 female        group C              some college
## 3            3 female        group B           master's degree
## 4            4   male        group A        associate's degree
## 5            5   male        group C              some college
## 6            6 female        group B        associate's degree
## 7            7 female        group B              some college
## 8            8   male        group B              some college
## 9            9   male        group D               high school
## 10          10 female        group B               high school
##    test_preparation_course math_score reading_score writing_score
## 1                     none         72            72            74
## 2                completed         69            90            88
## 3                     none         90            95            93
## 4                     none         47            57            44
## 5                     none         76            78            75
## 6                     none         71            83            78
## 7                completed         88            95            92
## 8                     none         40            43            39
## 9                completed         64            64            67
## 10                    none         38            60            50

#To exclude more than one column

# Drop three columns at once; the remaining columns keep their original order.
Rst4 <- Performance %>%
  select(-lunch, -math_score, -writing_score)
head(Rst4, n = 10)
##    students_id gender race_ethnicity parental_education_status
## 1            1 female        group B         bachelor's degree
## 2            2 female        group C              some college
## 3            3 female        group B           master's degree
## 4            4   male        group A        associate's degree
## 5            5   male        group C              some college
## 6            6 female        group B        associate's degree
## 7            7 female        group B              some college
## 8            8   male        group B              some college
## 9            9   male        group D               high school
## 10          10 female        group B               high school
##    test_preparation_course reading_score
## 1                     none            72
## 2                completed            90
## 3                     none            95
## 4                     none            57
## 5                     none            78
## 6                     none            83
## 7                completed            95
## 8                     none            43
## 9                completed            64
## 10                    none            60

#To select rows or students with high school qualification

# Rows where the parental education status is exactly "high school".
Rst5 <- Performance %>%
  filter(parental_education_status == "high school")
head(Rst5, n = 10)
##    students_id gender race_ethnicity parental_education_status        lunch
## 1            9   male        group D               high school free/reduced
## 2           10 female        group B               high school free/reduced
## 3           13 female        group B               high school     standard
## 4           17   male        group C               high school     standard
## 5           21   male        group D               high school     standard
## 6           29   male        group C               high school     standard
## 7           48 female        group C               high school     standard
## 8           50   male        group C               high school     standard
## 9           54   male        group D               high school     standard
## 10          56 female        group C               high school free/reduced
##    test_preparation_course math_score reading_score writing_score
## 1                completed         64            64            67
## 2                     none         38            60            50
## 3                     none         65            81            73
## 4                     none         88            89            86
## 5                     none         66            69            63
## 6                     none         70            70            65
## 7                     none         66            71            76
## 8                completed         82            84            82
## 9                     none         88            78            75
## 10                    none         33            41            43

#To select students with ethnicity of group A

# Rows for students whose race/ethnicity is "group A".
Rst6 <- Performance %>%
  filter(race_ethnicity == "group A")
head(Rst6, n = 10)
##    students_id gender race_ethnicity parental_education_status        lunch
## 1            4   male        group A        associate's degree free/reduced
## 2           14   male        group A              some college     standard
## 3           15 female        group A           master's degree     standard
## 4           26   male        group A           master's degree free/reduced
## 5           47 female        group A        associate's degree     standard
## 6           62   male        group A          some high school free/reduced
## 7           63   male        group A        associate's degree free/reduced
## 8           73 female        group A        associate's degree free/reduced
## 9           78   male        group A         bachelor's degree     standard
## 10          83   male        group A              some college free/reduced
##    test_preparation_course math_score reading_score writing_score
## 1                     none         47            57            44
## 2                completed         78            72            70
## 3                     none         50            53            58
## 4                     none         73            74            72
## 5                completed         55            65            62
## 6                     none         39            39            34
## 7                     none         62            61            55
## 8                     none         41            51            48
## 9                completed         80            78            81
## 10               completed         50            47            54

#To select female students whose parents have bachelor’s degree

# Female students whose parents hold a bachelor's degree. filter() combines
# comma-separated conditions with AND.
Rst7 <- Performance %>%
  filter(
    gender == "female",
    parental_education_status == "bachelor's degree"
  )
head(Rst7, n = 10)
##    students_id gender race_ethnicity parental_education_status        lunch
## 1            1 female        group B         bachelor's degree     standard
## 2           28 female        group C         bachelor's degree     standard
## 3           91 female        group C         bachelor's degree     standard
## 4          100 female        group D         bachelor's degree     standard
## 5          115 female        group E         bachelor's degree     standard
## 6          117 female        group B         bachelor's degree free/reduced
## 7          118 female        group D         bachelor's degree     standard
## 8          121 female        group C         bachelor's degree     standard
## 9          130 female        group A         bachelor's degree     standard
## 10         149 female        group D         bachelor's degree     standard
##    test_preparation_course math_score reading_score writing_score
## 1                     none         72            72            74
## 2                     none         67            69            75
## 3                     none         65            72            74
## 4                     none         65            67            62
## 5                completed         99           100           100
## 6                     none         75            85            82
## 7                     none         78            82            79
## 8                completed         79            92            89
## 9                     none         51            49            51
## 10               completed         68            75            81

#To get the minimum scores of the courses

# Minimum of each score column, returned as one row with one column per
# subject. across() applies min() to every listed column and keeps the column
# names, so the result is labelled and no longer relies on the multi-row
# summarise() behaviour that was deprecated in dplyr 1.1.0.
Performance %>%
  summarise(across(c(reading_score, math_score, writing_score), min))
##   reading_score math_score writing_score
## 1            17          0            10

#To get the maximum scores of the courses

# Maximum of each score column, returned as one row with one column per
# subject. across() applies max() to every listed column and keeps the column
# names, so the result is labelled and no longer relies on the multi-row
# summarise() behaviour that was deprecated in dplyr 1.1.0.
Performance %>%
  summarise(across(c(reading_score, math_score, writing_score), max))
##   reading_score math_score writing_score
## 1           100        100           100

#To select the maths scores of female students whose parents' education level is high school and whose maths score is greater than 80

# Math scores of female students whose parents' education status is
# "high school" AND whose math score exceeds 80. The math_score condition was
# missing from the original filter, so scores of 80 or below were returned too.
# NOTE(review): the printed rows that follow this call were captured before the
# math_score > 80 condition was added; re-run the script to refresh them.
Rst8 <- Performance %>%
  filter(
    gender == "female",
    parental_education_status == "high school",
    math_score > 80
  ) %>%
  select(math_score)
head(Rst8, 10)
##    math_score
## 1          38
## 2          65
## 3          66
## 4          33
## 5          87
## 6          66
## 7          68
## 8          46
## 9          50
## 10         42

#To get the gender and race of students who score more than 80 in maths

# Gender and race/ethnicity of every student scoring above 80 in math.
high_math <- filter(Performance, math_score > 80)
Rst9 <- select(high_math, gender, race_ethnicity)
head(Rst9, n = 10)
##    gender race_ethnicity
## 1  female        group B
## 2  female        group B
## 3    male        group C
## 4    male        group E
## 5    male        group E
## 6    male        group C
## 7    male        group D
## 8  female        group E
## 9  female        group D
## 10   male        group C

#To get the gender and parental education status of students who score more than 80 in maths and less than 80 in reading

# Students with math above 80 and reading below 80; keep only their gender and
# parental education status. Comma-separated filter() conditions are ANDed.
Rst10 <- Performance %>%
  filter(math_score > 80, reading_score < 80) %>%
  select(gender, parental_education_status)
head(Rst10, n = 10)
##    gender parental_education_status
## 1    male               high school
## 2    male               high school
## 3    male              some college
## 4    male              some college
## 5    male        associate's degree
## 6    male         bachelor's degree
## 7    male        associate's degree
## 8    male        associate's degree
## 9    male         bachelor's degree
## 10   male         bachelor's degree

#MUTATE: To get the average score of the three courses

# Add the per-student mean of the three subject scores as a new column.
# The column name (including its spelling) is referenced by later steps.
Performance.New <- mutate(
  Performance,
  Average_scrore = (math_score + reading_score + writing_score) / 3
)
head(Performance.New, n = 10)
##    students_id gender race_ethnicity parental_education_status        lunch
## 1            1 female        group B         bachelor's degree     standard
## 2            2 female        group C              some college     standard
## 3            3 female        group B           master's degree     standard
## 4            4   male        group A        associate's degree free/reduced
## 5            5   male        group C              some college     standard
## 6            6 female        group B        associate's degree     standard
## 7            7 female        group B              some college     standard
## 8            8   male        group B              some college free/reduced
## 9            9   male        group D               high school free/reduced
## 10          10 female        group B               high school free/reduced
##    test_preparation_course math_score reading_score writing_score
## 1                     none         72            72            74
## 2                completed         69            90            88
## 3                     none         90            95            93
## 4                     none         47            57            44
## 5                     none         76            78            75
## 6                     none         71            83            78
## 7                completed         88            95            92
## 8                     none         40            43            39
## 9                completed         64            64            67
## 10                    none         38            60            50
##    Average_scrore
## 1        72.66667
## 2        82.33333
## 3        92.66667
## 4        49.33333
## 5        76.33333
## 6        77.33333
## 7        91.66667
## 8        40.66667
## 9        65.00000
## 10       49.33333

#To check for the minimum value of the average score

# Smallest average score across all students (one-row summary).
summarise(Performance.New, min(Average_scrore))
##   min(Average_scrore)
## 1                   9

#To check for the maximum value of the average score

# Largest average score across all students (one-row summary).
summarise(Performance.New, max(Average_scrore))
##   max(Average_scrore)
## 1                 100

#To create a new categorical variable of 4-levels

# Classify each student by average score. case_when() takes the first matching
# branch, so the bands are: > 69 Excellent, > 49 Credit, > 30 Pass, otherwise
# Fail. The labels are then converted to a factor in the same step.
Students_Perf <- mutate(
  Performance.New,
  performance_status = as.factor(
    case_when(
      Average_scrore > 69 ~ "Excellent",
      Average_scrore > 49 ~ "Credit",
      Average_scrore > 30 ~ "Pass",
      TRUE ~ "Fail"
    )
  )
)
head(Students_Perf, n = 10)
##    students_id gender race_ethnicity parental_education_status        lunch
## 1            1 female        group B         bachelor's degree     standard
## 2            2 female        group C              some college     standard
## 3            3 female        group B           master's degree     standard
## 4            4   male        group A        associate's degree free/reduced
## 5            5   male        group C              some college     standard
## 6            6 female        group B        associate's degree     standard
## 7            7 female        group B              some college     standard
## 8            8   male        group B              some college free/reduced
## 9            9   male        group D               high school free/reduced
## 10          10 female        group B               high school free/reduced
##    test_preparation_course math_score reading_score writing_score
## 1                     none         72            72            74
## 2                completed         69            90            88
## 3                     none         90            95            93
## 4                     none         47            57            44
## 5                     none         76            78            75
## 6                     none         71            83            78
## 7                completed         88            95            92
## 8                     none         40            43            39
## 9                completed         64            64            67
## 10                    none         38            60            50
##    Average_scrore performance_status
## 1        72.66667          Excellent
## 2        82.33333          Excellent
## 3        92.66667          Excellent
## 4        49.33333             Credit
## 5        76.33333          Excellent
## 6        77.33333          Excellent
## 7        91.66667          Excellent
## 8        40.66667               Pass
## 9        65.00000             Credit
## 10       49.33333             Credit

#To check the structure of the new variable

# Inspect column types after adding the derived columns; performance_status
# should now be reported as a factor.
str(Students_Perf)
## 'data.frame':    1000 obs. of  11 variables:
##  $ students_id              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ gender                   : chr  "female" "female" "female" "male" ...
##  $ race_ethnicity           : chr  "group B" "group C" "group B" "group A" ...
##  $ parental_education_status: chr  "bachelor's degree" "some college" "master's degree" "associate's degree" ...
##  $ lunch                    : chr  "standard" "standard" "standard" "free/reduced" ...
##  $ test_preparation_course  : chr  "none" "completed" "none" "none" ...
##  $ math_score               : int  72 69 90 47 76 71 88 40 64 38 ...
##  $ reading_score            : int  72 90 95 57 78 83 95 43 64 60 ...
##  $ writing_score            : int  74 88 93 44 75 78 92 39 67 50 ...
##  $ Average_scrore           : num  72.7 82.3 92.7 49.3 76.3 ...
##  $ performance_status       : Factor w/ 4 levels "Credit","Excellent",..: 2 2 2 1 2 2 2 4 1 1 ...

#To check the mean score of each course based on the performance status

# Mean math, reading and writing score within each performance status.
Students_Perf %>%
  group_by(performance_status) %>%
  summarise(
    mean(math_score),
    mean(reading_score),
    mean(writing_score)
  )
## # A tibble: 4 × 4
##   performance_status `mean(math_score)` `mean(reading_score)`
##   <fct>                           <dbl>                 <dbl>
## 1 Credit                           58.9                  61.9
## 2 Excellent                        77.7                  81.1
## 3 Fail                             22.1                  27.9
## 4 Pass                             43.4                  44.6
## # ℹ 1 more variable: `mean(writing_score)` <dbl>

#Interpretation: Students with a credit pass had a mean maths score of about 58.9, those with an excellent performance about 77.7, those who failed about 22.1, and those with a pass about 43.4. The excellent group was also the largest single group of students.

#To get a frequency count of students based on their performance

# Number of students in each performance status, smallest group first.
status_counts <- count(Students_Perf, performance_status)
arrange(status_counts, n)
##   performance_status   n
## 1               Fail  10
## 2               Pass  86
## 3             Credit 430
## 4          Excellent 474

#Interpretation: This was a very good performance. 474 students had an excellent performance, 430 had a credit pass, 86 had a pass and 10 failed.

#To check the frequency count of students based on their performance and gender

# Number of students per performance status and gender, smallest group first.
Students_Perf %>%
  count(performance_status, gender) %>%
  arrange(n)
##   performance_status gender   n
## 1               Fail   male   4
## 2               Fail female   6
## 3               Pass female  30
## 4               Pass   male  56
## 5          Excellent   male 199
## 6             Credit female 207
## 7             Credit   male 223
## 8          Excellent female 275

#Interpretation: The result of the analysis showed that about 275 female students had an excellent performance, 199 male students performed excellently, 223 male students had a credit pass, 207 female students had a credit pass, 56 male students had a pass, 30 female students had a pass. Finally, 6 female students failed and 4 male students failed.

#To get a frequency count of students based on their performance, gender and race

# Counts per performance status, gender and race/ethnicity, sorted ascending
# so the rarest combinations come first.
status_gender_race <- count(
  Students_Perf,
  performance_status, gender, race_ethnicity
)
Rst11 <- arrange(status_gender_race, n)
head(Rst11, n = 10)
##    performance_status gender race_ethnicity n
## 1                Fail   male        group A 1
## 2                Fail   male        group B 1
## 3                Fail   male        group C 1
## 4                Fail   male        group E 1
## 5                Fail female        group C 2
## 6                Pass   male        group E 3
## 7                Fail female        group B 4
## 8                Pass female        group A 4
## 9                Pass female        group D 4
## 10               Pass female        group E 5
library('ggplot2')

#To plot a barchart of the performance status

# Bar chart: number of students in each performance status.
P1 <- ggplot(Students_Perf, aes(x = performance_status)) +
  geom_bar()
P1

#Interpretation: The bar chart also showed that the highest performance was an excellent performance, while the least was a fail.

#To plot a barchart of performance status based on gender

# Bar chart of performance status with one outlined bar per gender. The bars
# are drawn on top of each other (position = 'identity') with a faint red fill
# so both outlines stay visible.
P2 <- ggplot(Students_Perf) +
  geom_bar(
    mapping = aes(x = performance_status, color = gender),
    fill = 'red',
    alpha = 0.1,
    position = 'identity'
  )
P2

#To plot the boxplot of the students’ Average score based on gender

# Box plot of the average score for each gender; outliers are drawn as small
# green filled circles and the boxes are filled blue.
P3 <- ggplot(Students_Perf, aes(x = gender, y = Average_scrore)) +
  geom_boxplot(
    fill = 'blue',
    outlier.colour = 'green',
    outlier.shape = 19,
    outlier.size = 1.5
  )
P3

#Interpretation: About 25% of the female students had an average score below 60, while about 25% of the male students had an average score below 55.

#To plot the histogram showing the distribution of the students’ Average score

# Histogram of the students' average score. An explicit binwidth of 3 score
# points (the width already used for the by-gender histogram in this script)
# replaces the default bins = 30, so the binning is a deliberate choice and
# ggplot2 no longer emits its "Pick better value with `binwidth`" message.
P4 <- ggplot(data = Students_Perf, mapping = aes(x = Average_scrore)) +
  geom_histogram(binwidth = 3)
P4

#Interpretation: The histogram shows the distribution of the students' average scores. It is roughly bell-shaped but skewed to the left. Most of the average scores are between 50 and 90; only very few students had average scores below 30, and few had average scores between 95 and 100.

#To plot the histogram showing the distribution of the students’ Average score based on gender

# Histogram of the average score with one outlined distribution per gender.
# geom_histogram() stacks groups by default, which would pile one gender's
# counts on top of the other's; position = "identity" draws both distributions
# from the baseline so their heights can be compared directly. The faint fill
# (alpha = 0.1) keeps the overlapping bars readable.
P5 <- ggplot(data = Students_Perf, aes(x = Average_scrore, color = gender)) +
  geom_histogram(
    fill = "red",
    binwidth = 3,
    alpha = 0.1,
    position = "identity"
  )
P5

#Interpretation: The results showed that the distribution of the female students' average scores lies higher than that of their male counterparts. This is because the average scores of the female students are generally higher than those of the males.

#Bar chart showing the frequency distribution of performance status based on gender

# Side-by-side (dodged) bars: number of students in each performance status,
# split by gender via the fill colour.
P8 <- ggplot(Students_Perf, aes(x = performance_status, fill = gender)) +
  geom_bar(position = "dodge")
P8

#Interpretation: The results from the bar chart showed that more male students than female students had a credit pass. More female students than male students had an excellent performance. More male students than female students had a pass.