library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'readr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Show the current working directory (a sanity check before loading the data).
getwd()
## [1] "C:/Users/Maggie/Documents"
# Read the Students Performance CSV with readr::read_csv().
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs unchanged on the original author's computer.
StudentsPerformance <- read_csv("C:/Users/Maggie/Downloads/StudentsPerformance.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## gender = col_character(),
## `race/ethnicity` = col_character(),
## `parental level of education` = col_character(),
## lunch = col_character(),
## `test preparation course` = col_character(),
## `math score` = col_double(),
## `reading score` = col_double(),
## `writing score` = col_double()
## )
# Open the full data set in the data viewer.
view(StudentsPerformance)
# Print a per-column summary of the data.
summary(StudentsPerformance)
## gender race/ethnicity parental level of education
## Length:1000 Length:1000 Length:1000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## lunch test preparation course math score reading score
## Length:1000 Length:1000 Min. : 0.00 Min. : 17.00
## Class :character Class :character 1st Qu.: 57.00 1st Qu.: 59.00
## Mode :character Mode :character Median : 66.00 Median : 70.00
## Mean : 66.09 Mean : 69.17
## 3rd Qu.: 77.00 3rd Qu.: 79.00
## Max. :100.00 Max. :100.00
## writing score
## Min. : 10.00
## 1st Qu.: 57.75
## Median : 69.00
## Mean : 68.05
## 3rd Qu.: 79.00
## Max. :100.00
# Preview the first rows of the data.
head(StudentsPerformance)
## # A tibble: 6 x 8
## gender `race/ethnicity` `parental level~ lunch `test preparati~ `math score`
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 female group B bachelor's degr~ stan~ none 72
## 2 female group C some college stan~ completed 69
## 3 female group B master's degree stan~ none 90
## 4 male group A associate's deg~ free~ none 47
## 5 male group C some college stan~ none 76
## 6 female group B associate's deg~ stan~ none 71
## # ... with 2 more variables: `reading score` <dbl>, `writing score` <dbl>
# Dimensions of the data: number of rows, then number of columns.
dim(StudentsPerformance)
## [1] 1000 8
# Check whether any cell in the data is missing.
any(is.na(StudentsPerformance))
## [1] FALSE
# Frequency of each test-preparation status, drawn as a base-R bar chart.
Test_Preparation_Course <- table(
  StudentsPerformance[["test preparation course"]]
)
barplot(
  Test_Preparation_Course,
  main = "Barplot of Test Preparation Course",
  col = "light blue"
)

# Frequency of each parental education level, drawn the same way.
Parental_Level_of_Education <- table(
  StudentsPerformance[["parental level of education"]]
)
barplot(
  Parental_Level_of_Education,
  main = "Number of Parents in Each Education Level"
)
# Dodged bar chart: number of students in each race/ethnicity group, by gender.
# Columns are referenced by bare name inside aes(); writing `df$col` there
# bypasses ggplot2's data masking and is flagged by ggplot2 as discouraged.
ggplot(StudentsPerformance, aes(x = `race/ethnicity`, fill = gender)) +
  geom_bar(position = "dodge") +
  xlab("Race/Ethnicity") +
  ggtitle("Barplot of Students in Each Race/Ethnicity Group by Gender")
# Side-by-side base-R boxplots of the three exam score columns.
StudentsPerformance %>%
  select(`math score`, `reading score`, `writing score`) %>%
  boxplot()
# Reshape the three score columns into long form (one row per student and test),
# then draw one boxplot per test with a separate panel for each gender.
BOXPLOT <- StudentsPerformance %>%
  select(gender, `math score`, `reading score`, `writing score`) %>%
  pivot_longer(cols = -gender, names_to = "Tests", values_to = "Score") %>%
  ggplot(mapping = aes(x = Tests, y = Score)) +
  geom_boxplot() +
  theme_dark() +
  theme(panel.background = element_rect(fill = "light blue")) +
  ggtitle("Boxplot of Scores by Gender") +
  facet_wrap(vars(gender))

# Convert the static ggplot into an interactive plotly widget.
ggplotly(BOXPLOT)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Add each student's overall grade: the unweighted mean of the three scores.
# Columns are referenced by bare name so mutate() reads them from the data it
# was given rather than from the global StudentsPerformance object.
StudentsPerformance2 <- StudentsPerformance %>%
  mutate(
    Overall_Grade = (`math score` + `reading score` + `writing score`) / 3
  )
# Print the augmented tibble.
StudentsPerformance2
## # A tibble: 1,000 x 9
## gender `race/ethnicity` `parental level~ lunch `test preparati~ `math score`
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 female group B bachelor's degr~ stan~ none 72
## 2 female group C some college stan~ completed 69
## 3 female group B master's degree stan~ none 90
## 4 male group A associate's deg~ free~ none 47
## 5 male group C some college stan~ none 76
## 6 female group B associate's deg~ stan~ none 71
## 7 female group B some college stan~ completed 88
## 8 male group B some college free~ none 40
## 9 male group D high school free~ completed 64
## 10 female group B high school free~ none 38
## # ... with 990 more rows, and 3 more variables: `reading score` <dbl>, `writing
## # score` <dbl>, Overall_Grade <dbl>
# Students with a perfect overall grade. Overall_Grade is a computed double, so
# it is compared with near() (tolerance-based) instead of exact `==`.
filter(StudentsPerformance2, near(Overall_Grade, 100))
## # A tibble: 3 x 9
## gender `race/ethnicity` `parental level~ lunch `test preparati~ `math score`
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 female group E bachelor's degr~ stan~ none 100
## 2 male group E bachelor's degr~ stan~ completed 100
## 3 female group E associate's deg~ stan~ none 100
## # ... with 3 more variables: `reading score` <dbl>, `writing score` <dbl>,
## # Overall_Grade <dbl>
# Boxplots of overall grade for each parental education level, one panel per
# gender. Columns are referenced by bare name inside aes(): `df$col` hands
# ggplot2 a free-standing vector that is not tied to the rows in each facet.
ggplot(
  StudentsPerformance2,
  aes(x = `parental level of education`, y = Overall_Grade)
) +
  geom_boxplot() +
  labs(
    x = "Parental Level of Education",
    y = "Student's Overall Grade",
    title = "Boxplot comparing parental level of education to student's overall grade"
  ) +
  facet_wrap(~gender)
# Dodged bar chart: students who did / did not complete the test preparation
# course, split by gender. The column is referenced by bare name inside aes()
# rather than through `df$col`, which ggplot2 discourages.
ggplot(
  StudentsPerformance2,
  aes(x = `test preparation course`, fill = gender)
) +
  geom_bar(position = "dodge") +
  labs(x = "Test Preparation Course", title = "Test Preparation Course by Gender")
# Base-R boxplot of overall grade split by gender.
boxplot(
  Overall_Grade ~ gender,
  data = StudentsPerformance2,
  main = "Overall Grade by Gender",
  xlab = "Gender",
  ylab = "Overall Grade"
)
# Number of students of each gender.
table(StudentsPerformance2[["gender"]])
##
## female male
## 518 482
# Distribution of overall grades across all students.
hist(
  StudentsPerformance2[["Overall_Grade"]],
  col = "blue",
  xlab = "Overall Grade",
  main = "Histogram of Students' Overall Grade"
)
# Convert the numeric overall grade into a letter grade.
# Bands are left-closed: F [0, 60), D [60, 70), C [70, 80), B [80, 90),
# A [90, 100]. Whole-number breaks keep a value such as 59.995 in the lower
# band; cut points like 59.99 would push it into the next grade up.
StudentsPerformance3 <- StudentsPerformance2 %>%
  mutate(
    Letter_Grade = cut(
      Overall_Grade,
      breaks = c(0, 60, 70, 80, 90, 100),
      labels = c("F", "D", "C", "B", "A"),
      right = FALSE,        # intervals are closed on the left: [a, b)
      include.lowest = TRUE # with right = FALSE this closes the top band at 100
    )
  )
# Open the result in the data viewer.
view(StudentsPerformance3)
# Dodged bar chart of letter-grade counts by gender, one panel per
# race/ethnicity group. Columns are referenced by bare name in both aes() and
# facet_wrap(), so each panel is built from the rows that belong to it instead
# of from a free-standing `df$col` vector.
ggplot(StudentsPerformance3, aes(x = Letter_Grade, fill = gender)) +
  geom_bar(position = "dodge") +
  facet_wrap(~`race/ethnicity`) +
  xlab("Letter Grade")
# Mosaic plot of letter grade against test-preparation status.
mosaicplot(
  Letter_Grade ~ `test preparation course`,
  data = StudentsPerformance3,
  main = "Mosaicplot of Letter Grade and Test Preparation Course",
  xlab = "Letter Grade",
  ylab = "Test Preparation Course"
)
# Keep the four categorical columns used for the combined mosaic plot.
# Columns are selected by name; positional indices (1:2, 5, 10) would silently
# pick different columns if the column order ever changed.
Select <- StudentsPerformance3 %>%
  select(gender, `race/ethnicity`, `test preparation course`, Letter_Grade)
# Four-way contingency table:
# gender x race/ethnicity x test preparation course x letter grade.
Table <- table(
  Select$gender,
  Select$`race/ethnicity`,
  Select$`test preparation course`,
  Select$Letter_Grade
)
# mosaicplot() is called for its drawing side effect.
# NOTE(review): MP is kept for compatibility but likely holds NULL; confirm
# nothing downstream depends on it.
MP <- mosaicplot(Table)
# Dodged bar chart of letter-grade counts per race/ethnicity group, with one
# panel for each combination of test-preparation status and gender.
# Columns are referenced by bare name in aes() and facet_wrap() rather than
# through `df$col`, which ggplot2 discourages.
GGPLOT <- ggplot(
  StudentsPerformance3,
  aes(x = `race/ethnicity`, fill = Letter_Grade)
) +
  geom_bar(position = "dodge") +
  facet_wrap(`test preparation course` ~ gender) +
  theme_dark() +
  theme(
    panel.background = element_rect(fill = "#2D2D2D"),
    legend.background = element_rect(fill = "#2D2D2D"),
    legend.title = element_text(size = 8),
    # Bold title: `face` is a text-element property, not a labs() argument,
    # so it has to be set on plot.title here to take effect.
    plot.title = element_text(face = "bold")
  ) +
  scale_fill_viridis_d() +
  labs(
    title = "Comparing Student Letter Grade by Race/Ethnicity, Gender and Whether They Had Taken a Test Preparation Course",
    y = "Count",
    x = "Race/Ethnicity",
    fill = "Letter Grade"
  )

# Interactive version of the chart with the plotly mode bar hidden.
ggplotly(GGPLOT) %>%
  config(displayModeBar = FALSE)
Essay: My dataset is Students Performance on Exams from Kaggle. It is a fictional dataset, meaning the data were made up and do not describe real students. The dataset records students’ math, reading, and writing scores. Besides these three quantitative variables, the dataset also includes five categorical variables: gender (gender of the student), race/ethnicity (five different groups), parental level of education (the education level of the student’s parents), lunch (whether the student has standard or free/reduced lunch), and test preparation course (whether or not the student has completed a test preparation course). Using the dim function, I found that the dataset has 1000 rows and 8 columns. I started exploring the dataset with the summary, head, and any(is.na) functions to look at a summary of each variable, the first 6 rows, and whether there are any missing values. I also used the filter, mutate, and select functions from the dplyr library. I used the filter function to see whether any students had an overall grade of 100 and found that 3 students did. I used the mutate function to add columns for each student’s overall grade and the corresponding letter grade. Then I used the select function to pick out several categorical variables for a mosaic plot. I chose this topic because I am also a student, so it is easy for me to relate to. I chose this dataset because it contains scores for several different tests, so I thought it would be interesting to see how each gender does on different kinds of tests.
According to a blog post by Mark J. Perry titled “Gender differences on the ACT test: Boys score higher on math and science; girls score higher on English and Reading,” high school girls tend to do slightly better in language and reading on average, while high school boys exhibit slightly greater cognitive abilities in math and science on average. In this dataset, one of my data visualizations, a boxplot, shows that girls did do better on the reading and writing tests, while boys did better in math. My data visualizations also show that the group of students who completed the test preparation course has a higher number of students with the letter grade A and far fewer students with the letter grade F than the group who did not. It is also interesting that slightly more girls than boys completed the test preparation course. Overall, I think test preparation courses are helpful for test taking. One thing that I wanted to show but could not get to work was a mosaic plot of all my categorical variables. Since my dataset is heavy on categorical variables, I thought a complex mosaic plot would work well for comparing gender, race/ethnicity, parental level of education, lunch, and test preparation course.