# Load the students' performance data set; the first line of the CSV holds the
# column names.
# NOTE(review): the path is absolute and machine-specific, so it has to be
# adjusted before the script can run anywhere else.
Performance <- read.csv(
  file = "C:/Users/User/Downloads/Textbooks/StudentsPerformance.csv",
  header = TRUE
)
# Preview the first five rows.
head(Performance, n = 5)
## students_id gender race_ethnicity parental_education_status lunch
## 1 1 female group B bachelor's degree standard
## 2 2 female group C some college standard
## 3 3 female group B master's degree standard
## 4 4 male group A associate's degree free/reduced
## 5 5 male group C some college standard
## test_preparation_course math_score reading_score writing_score
## 1 none 72 72 74
## 2 completed 69 90 88
## 3 none 90 95 93
## 4 none 47 57 44
## 5 none 76 78 75
# Show the compact structure of the data frame: column names, types and the
# first few values of each column.
str(object = Performance)
## 'data.frame': 1000 obs. of 9 variables:
## $ students_id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : chr "female" "female" "female" "male" ...
## $ race_ethnicity : chr "group B" "group C" "group B" "group A" ...
## $ parental_education_status: chr "bachelor's degree" "some college" "master's degree" "associate's degree" ...
## $ lunch : chr "standard" "standard" "standard" "free/reduced" ...
## $ test_preparation_course : chr "none" "completed" "none" "none" ...
## $ math_score : int 72 69 90 47 76 71 88 40 64 38 ...
## $ reading_score : int 72 90 95 57 78 83 95 43 64 60 ...
## $ writing_score : int 74 88 93 44 75 78 92 39 67 50 ...
# Report the dimensions of the data frame as (rows, columns).
dim(x = Performance)
## [1] 1000 9
# Number of columns first, then number of rows.
ncol(x = Performance)
## [1] 9
nrow(x = Performance)
## [1] 1000
# List the column names (for a data frame, names() returns the same vector as
# colnames()).
names(Performance)
## [1] "students_id" "gender"
## [3] "race_ethnicity" "parental_education_status"
## [5] "lunch" "test_preparation_course"
## [7] "math_score" "reading_score"
## [9] "writing_score"
#To select one or more variables
library('tidyverse')
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Keep only the `lunch` column, then show its first ten rows.
Rst1 <- select(Performance, lunch)
head(Rst1, n = 10)
## lunch
## 1 standard
## 2 standard
## 3 standard
## 4 free/reduced
## 5 standard
## 6 standard
## 7 standard
## 8 free/reduced
## 9 free/reduced
## 10 free/reduced
# Select several columns at once: the maths and writing scores.
Rst2 <- select(Performance, math_score, writing_score)
head(Rst2, n = 10)
## math_score writing_score
## 1 72 74
## 2 69 88
## 3 90 93
## 4 47 44
## 5 76 75
## 6 71 78
## 7 88 92
## 8 40 39
## 9 64 67
## 10 38 50
# Drop a single column (`lunch`) and keep every other one.
Rst3 <- select(Performance, -lunch)
head(Rst3, n = 10)
## students_id gender race_ethnicity parental_education_status
## 1 1 female group B bachelor's degree
## 2 2 female group C some college
## 3 3 female group B master's degree
## 4 4 male group A associate's degree
## 5 5 male group C some college
## 6 6 female group B associate's degree
## 7 7 female group B some college
## 8 8 male group B some college
## 9 9 male group D high school
## 10 10 female group B high school
## test_preparation_course math_score reading_score writing_score
## 1 none 72 72 74
## 2 completed 69 90 88
## 3 none 90 95 93
## 4 none 47 57 44
## 5 none 76 78 75
## 6 none 71 83 78
## 7 completed 88 95 92
## 8 none 40 43 39
## 9 completed 64 64 67
## 10 none 38 60 50
# Drop several columns at once; each negated name removes that column from
# the full set.
Rst4 <- select(Performance, -lunch, -math_score, -writing_score)
head(Rst4, n = 10)
## students_id gender race_ethnicity parental_education_status
## 1 1 female group B bachelor's degree
## 2 2 female group C some college
## 3 3 female group B master's degree
## 4 4 male group A associate's degree
## 5 5 male group C some college
## 6 6 female group B associate's degree
## 7 7 female group B some college
## 8 8 male group B some college
## 9 9 male group D high school
## 10 10 female group B high school
## test_preparation_course reading_score
## 1 none 72
## 2 completed 90
## 3 none 95
## 4 none 57
## 5 none 78
## 6 none 83
## 7 completed 95
## 8 none 43
## 9 completed 64
## 10 none 60
# Keep the rows (students) whose parental education status is "high school".
Rst5 <- Performance %>%
  filter(parental_education_status == "high school")
head(Rst5, n = 10)
## students_id gender race_ethnicity parental_education_status lunch
## 1 9 male group D high school free/reduced
## 2 10 female group B high school free/reduced
## 3 13 female group B high school standard
## 4 17 male group C high school standard
## 5 21 male group D high school standard
## 6 29 male group C high school standard
## 7 48 female group C high school standard
## 8 50 male group C high school standard
## 9 54 male group D high school standard
## 10 56 female group C high school free/reduced
## test_preparation_course math_score reading_score writing_score
## 1 completed 64 64 67
## 2 none 38 60 50
## 3 none 65 81 73
## 4 none 88 89 86
## 5 none 66 69 63
## 6 none 70 70 65
## 7 none 66 71 76
## 8 completed 82 84 82
## 9 none 88 78 75
## 10 none 33 41 43
# Keep the students whose race/ethnicity is "group A".
Rst6 <- Performance %>%
  filter(race_ethnicity == "group A")
head(Rst6, n = 10)
## students_id gender race_ethnicity parental_education_status lunch
## 1 4 male group A associate's degree free/reduced
## 2 14 male group A some college standard
## 3 15 female group A master's degree standard
## 4 26 male group A master's degree free/reduced
## 5 47 female group A associate's degree standard
## 6 62 male group A some high school free/reduced
## 7 63 male group A associate's degree free/reduced
## 8 73 female group A associate's degree free/reduced
## 9 78 male group A bachelor's degree standard
## 10 83 male group A some college free/reduced
## test_preparation_course math_score reading_score writing_score
## 1 none 47 57 44
## 2 completed 78 72 70
## 3 none 50 53 58
## 4 none 73 74 72
## 5 completed 55 65 62
## 6 none 39 39 34
## 7 none 62 61 55
## 8 none 41 51 48
## 9 completed 80 78 81
## 10 completed 50 47 54
# Keep the female students whose parents hold a bachelor's degree.
# Conditions separated by commas inside filter() are combined with AND.
Rst7 <- Performance %>%
  filter(
    gender == "female",
    parental_education_status == "bachelor's degree"
  )
head(Rst7, n = 10)
## students_id gender race_ethnicity parental_education_status lunch
## 1 1 female group B bachelor's degree standard
## 2 28 female group C bachelor's degree standard
## 3 91 female group C bachelor's degree standard
## 4 100 female group D bachelor's degree standard
## 5 115 female group E bachelor's degree standard
## 6 117 female group B bachelor's degree free/reduced
## 7 118 female group D bachelor's degree standard
## 8 121 female group C bachelor's degree standard
## 9 130 female group A bachelor's degree standard
## 10 149 female group D bachelor's degree standard
## test_preparation_course math_score reading_score writing_score
## 1 none 72 72 74
## 2 none 67 69 75
## 3 none 65 72 74
## 4 none 65 67 62
## 5 completed 99 100 100
## 6 none 75 85 82
## 7 none 78 82 79
## 8 completed 79 92 89
## 9 none 51 49 51
## 10 completed 68 75 81
#To get the minimum scores of the courses
# Each minimum gets its own named column, so summarise() returns a single
# labelled row. Wrapping the three minima in c() produced three unlabelled
# rows, which dplyr 1.1.0 deprecated for summarise().
Performance %>%
  summarise(
    min_reading = min(reading_score),
    min_math = min(math_score),
    min_writing = min(writing_score)
  )
## min_reading min_math min_writing
## 1 17 0 10
#To get the maximum scores of the courses
# Each maximum gets its own named column, so summarise() returns a single
# labelled row. Wrapping the three maxima in c() produced three unlabelled
# rows, which dplyr 1.1.0 deprecated for summarise().
Performance %>%
  summarise(
    max_reading = max(reading_score),
    max_math = max(math_score),
    max_writing = max(writing_score)
  )
## max_reading max_math max_writing
## 1 100 100 100
#To select rows with students who are female and whose parents are high school and maths score is greater than 80
# The `math_score > 80` condition was missing from the filter, so scores such
# as 38 and 65 were returned.
# NOTE(review): the console output recorded below was captured before this
# condition was added and must be regenerated.
Rst8 <- Performance %>%
  filter(
    gender == "female",
    parental_education_status == "high school",
    math_score > 80
  ) %>%
  select(math_score)
head(Rst8, 10)
## math_score
## 1 38
## 2 65
## 3 66
## 4 33
## 5 87
## 6 66
## 7 68
## 8 46
## 9 50
## 10 42
# Gender and race/ethnicity of the students who scored more than 80 in maths:
# filter the rows first, then keep the two descriptive columns.
Rst9 <- select(
  filter(Performance, math_score > 80),
  gender, race_ethnicity
)
head(Rst9, n = 10)
## gender race_ethnicity
## 1 female group B
## 2 female group B
## 3 male group C
## 4 male group E
## 5 male group E
## 6 male group C
## 7 male group D
## 8 female group E
## 9 female group D
## 10 male group C
# Gender and parental education status of the students who scored above 80 in
# maths but below 80 in reading (both conditions must hold).
Rst10 <- Performance %>%
  filter(math_score > 80, reading_score < 80) %>%
  select(gender, parental_education_status)
head(Rst10, n = 10)
## gender parental_education_status
## 1 male high school
## 2 male high school
## 3 male some college
## 4 male some college
## 5 male associate's degree
## 6 male bachelor's degree
## 7 male associate's degree
## 8 male associate's degree
## 9 male bachelor's degree
## 10 male bachelor's degree
# MUTATE: add a column holding each student's average over the three courses.
# NOTE(review): the column name `Average_scrore` is misspelled but is kept
# as-is because the rest of the script refers to it by this name.
Performance.New <- mutate(
  Performance,
  Average_scrore = (math_score + reading_score + writing_score) / 3
)
head(Performance.New, n = 10)
## students_id gender race_ethnicity parental_education_status lunch
## 1 1 female group B bachelor's degree standard
## 2 2 female group C some college standard
## 3 3 female group B master's degree standard
## 4 4 male group A associate's degree free/reduced
## 5 5 male group C some college standard
## 6 6 female group B associate's degree standard
## 7 7 female group B some college standard
## 8 8 male group B some college free/reduced
## 9 9 male group D high school free/reduced
## 10 10 female group B high school free/reduced
## test_preparation_course math_score reading_score writing_score
## 1 none 72 72 74
## 2 completed 69 90 88
## 3 none 90 95 93
## 4 none 47 57 44
## 5 none 76 78 75
## 6 none 71 83 78
## 7 completed 88 95 92
## 8 none 40 43 39
## 9 completed 64 64 67
## 10 none 38 60 50
## Average_scrore
## 1 72.66667
## 2 82.33333
## 3 92.66667
## 4 49.33333
## 5 76.33333
## 6 77.33333
## 7 91.66667
## 8 40.66667
## 9 65.00000
## 10 49.33333
# Smallest value of the average score across all students.
summarise(Performance.New, min(Average_scrore))
## min(Average_scrore)
## 1 9
# Largest value of the average score across all students.
summarise(Performance.New, max(Average_scrore))
## max(Average_scrore)
## 1 100
# Derive a four-level categorical variable from the average score.
# case_when() takes the first matching condition, so the bands are:
#   > 69 "Excellent", > 49 "Credit", > 30 "Pass", anything else "Fail".
# as.factor() orders the levels alphabetically (Credit, Excellent, Fail, Pass).
Students_Perf <- Performance.New %>%
  mutate(
    performance_status = as.factor(
      case_when(
        Average_scrore > 69 ~ "Excellent",
        Average_scrore > 49 ~ "Credit",
        Average_scrore > 30 ~ "Pass",
        TRUE ~ "Fail"
      )
    )
  )
head(Students_Perf, n = 10)
## students_id gender race_ethnicity parental_education_status lunch
## 1 1 female group B bachelor's degree standard
## 2 2 female group C some college standard
## 3 3 female group B master's degree standard
## 4 4 male group A associate's degree free/reduced
## 5 5 male group C some college standard
## 6 6 female group B associate's degree standard
## 7 7 female group B some college standard
## 8 8 male group B some college free/reduced
## 9 9 male group D high school free/reduced
## 10 10 female group B high school free/reduced
## test_preparation_course math_score reading_score writing_score
## 1 none 72 72 74
## 2 completed 69 90 88
## 3 none 90 95 93
## 4 none 47 57 44
## 5 none 76 78 75
## 6 none 71 83 78
## 7 completed 88 95 92
## 8 none 40 43 39
## 9 completed 64 64 67
## 10 none 38 60 50
## Average_scrore performance_status
## 1 72.66667 Excellent
## 2 82.33333 Excellent
## 3 92.66667 Excellent
## 4 49.33333 Credit
## 5 76.33333 Excellent
## 6 77.33333 Excellent
## 7 91.66667 Excellent
## 8 40.66667 Pass
## 9 65.00000 Credit
## 10 49.33333 Credit
# Inspect the structure of the extended data frame, including the new
# `performance_status` factor.
str(object = Students_Perf)
## 'data.frame': 1000 obs. of 11 variables:
## $ students_id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : chr "female" "female" "female" "male" ...
## $ race_ethnicity : chr "group B" "group C" "group B" "group A" ...
## $ parental_education_status: chr "bachelor's degree" "some college" "master's degree" "associate's degree" ...
## $ lunch : chr "standard" "standard" "standard" "free/reduced" ...
## $ test_preparation_course : chr "none" "completed" "none" "none" ...
## $ math_score : int 72 69 90 47 76 71 88 40 64 38 ...
## $ reading_score : int 72 90 95 57 78 83 95 43 64 60 ...
## $ writing_score : int 74 88 93 44 75 78 92 39 67 50 ...
## $ Average_scrore : num 72.7 82.3 92.7 49.3 76.3 ...
## $ performance_status : Factor w/ 4 levels "Credit","Excellent",..: 2 2 2 1 2 2 2 4 1 1 ...
# Mean score of each course within every performance status.
Students_Perf %>%
  group_by(performance_status) %>%
  summarise(
    mean(math_score),
    mean(reading_score),
    mean(writing_score)
  )
## # A tibble: 4 × 4
## performance_status `mean(math_score)` `mean(reading_score)`
## <fct> <dbl> <dbl>
## 1 Credit 58.9 61.9
## 2 Excellent 77.7 81.1
## 3 Fail 22.1 27.9
## 4 Pass 43.4 44.6
## # ℹ 1 more variable: `mean(writing_score)` <dbl>
#Interpretation: Students with a credit pass had a mean math score of about 58.9, those with an excellent performance about 77.7, those who failed about 22.1 and those with a pass about 43.4. Most of the students passed excellently.
# Frequency count of students per performance status, smallest group first.
arrange(count(Students_Perf, performance_status), n)
## performance_status n
## 1 Fail 10
## 2 Pass 86
## 3 Credit 430
## 4 Excellent 474
#Interpretation: This was a very good performance. About 474 students had an excellent performance, 430 had a credit pass, 86 had a pass and 10 failed.
# Frequency count of students per performance status and gender, smallest
# group first.
arrange(count(Students_Perf, performance_status, gender), n)
## performance_status gender n
## 1 Fail male 4
## 2 Fail female 6
## 3 Pass female 30
## 4 Pass male 56
## 5 Excellent male 199
## 6 Credit female 207
## 7 Credit male 223
## 8 Excellent female 275
#Interpretation: The result of the analysis showed that about 275 female students had an excellent performance, 199 male students performed excellently, 223 male students had a credit pass, 207 female students had a credit pass, 56 male students had a pass, 30 female students had a pass. Finally, 6 female students failed and 4 male students failed.
# Frequency count of students per performance status, gender and
# race/ethnicity, smallest group first.
Rst11 <- arrange(
  count(Students_Perf, performance_status, gender, race_ethnicity),
  n
)
head(Rst11, n = 10)
## performance_status gender race_ethnicity n
## 1 Fail male group A 1
## 2 Fail male group B 1
## 3 Fail male group C 1
## 4 Fail male group E 1
## 5 Fail female group C 2
## 6 Pass male group E 3
## 7 Fail female group B 4
## 8 Pass female group A 4
## 9 Pass female group D 4
## 10 Pass female group E 5
library('ggplot2')
# Bar chart of the number of students in each performance status.
P1 <- ggplot(Students_Perf) +
  geom_bar(aes(x = performance_status))
P1
#Interpretation: The bar chart also showed that the highest performance
#was an excellent performance, while the least was a fail.
# Bar chart of performance status with one outlined bar per gender.
# position = "identity" draws the bars of both genders on top of each other;
# the low alpha keeps the overlapped bars visible.
P2 <- ggplot(Students_Perf, aes(x = performance_status, colour = gender)) +
  geom_bar(position = "identity", fill = "red", alpha = 0.1)
P2
# Box plot of the students' average score for each gender; outliers are drawn
# as small green filled circles.
P3 <- ggplot(Students_Perf) +
  geom_boxplot(
    aes(x = gender, y = Average_scrore),
    fill = "blue",
    outlier.colour = "green",
    outlier.shape = 19,
    outlier.size = 1.5
  )
P3
#Interpretation: About 25% of the female students had an average score
#below 60, while about 25% of the male students had an average score below 55.
# Histogram of the students' average score, using the default binning.
P4 <- ggplot(Students_Perf) +
  geom_histogram(aes(x = Average_scrore))
P4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Interpretation: The histogram showed the distribution of the average
#scores of the students. It is approximately normal, though skewed to the left.
#Most of the average scores are between 50 and 90. Therefore, only very
#few students had average scores below 30 and few had average scores
#between 95 and 100.
# Histogram of the students' average score with the bars outlined by gender,
# using bins that are 3 points wide.
P5 <- ggplot(Students_Perf, aes(x = Average_scrore, colour = gender)) +
  geom_histogram(binwidth = 3, fill = "red", alpha = 0.1)
P5
#Interpretation: The results showed that the females had a higher
#distribution than their male counterparts. This was because the average
#scores of the female students are higher than those of the males.
# Bar chart of the frequency of each performance status, with the bars of the
# two genders placed side by side.
P8 <- ggplot(Students_Perf) +
  geom_bar(
    aes(x = performance_status, fill = gender),
    position = "dodge"
  )
P8
#Interpretation: The results from the bar chart showed that more male
#students had a credit pass than the females. More female students
#had an excellent performance than the male students. More male students
#had a pass than the females.