# Global chunk option for the report: echo the source code of every chunk.
# (Setting it once is enough; the option persists for all later chunks.)
knitr::opts_chunk$set(echo = TRUE)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(ggplot2)
library(stringr)
# Location of the raw student-results CSV.
urlfile <- "https://raw.githubusercontent.com/Nhodgkinson/DATA-607-P2/main/student_results.csv"

# Open a connection to that URL and let readr parse it into a tibble.
srpdata <- urlfile %>%
  url() %>%
  read_csv()
## Rows: 10 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): name, sex and age, test number
## dbl (5): id, phone, term 1, term 2, term 3
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
The data is currently in a wide format and needs to be tidied. I want to gather the three term columns into a single `Term` column and create a new `Score` column that holds the values currently spread out under each term. After that, we need to clean up some of the other columns, for example by making sure each one has the correct data type.
# Display the data as loaded, before any reshaping or cleaning.
srpdata
## # A tibble: 10 × 8
## id name phone `sex and age` `test number` `term 1` `term 2` `term 3`
## <dbl> <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 Mike 134 m_12 test 1 76 84 87
## 2 2 Linda 270 f_13 test 1 88 90 73
## 3 3 Sam 210 m_11 test 1 78 74 80
## 4 4 Esther 617 f_12 test 1 68 75 74
## 5 5 Mary 114 f_14 test 1 65 67 64
## 6 1 Mike 134 m_12 test 2 85 80 90
## 7 2 Linda 270 f_13 test 2 87 82 94
## 8 3 Sam 210 m_11 test 2 80 87 80
## 9 4 Esther 617 f_12 test 2 70 75 78
## 10 5 Mary 114 f_14 test 2 68 70 63
# Tidy the wide student-results table into one row per student, test and term.
#
# Steps:
#   1. Stack the term columns into a `Term` (name) / `Score` (value) pair.
#   2. Store `id` as character: it is an identifier, not a quantity.
#   3. Split `sex and age` (e.g. "m_12") into `Sex`, the first non-digit
#      character, and `Age`, the first run of digits, stored as an integer.
#   4. Drop `sex and age` and put the descriptive columns first.
#   5. Sort by id, then term, then test, so the row order is deterministic.
#
# Columns are always referred to by name, never by position, so the code keeps
# working if the input gains or reorders columns.
srdf <- srpdata %>%
  pivot_longer(
    cols = starts_with("term"),
    names_to = "Term",
    values_to = "Score"
  ) %>%
  mutate(
    id = as.character(id),
    Sex = str_extract(`sex and age`, "\\D"),
    Age = as.integer(str_extract(`sex and age`, "\\d+"))
  ) %>%
  select(id, name, Sex, Age, phone, `test number`, Term, Score) %>%
  arrange(id, Term, `test number`)
I want to see how individual test 1 scores compare to the average test 1 score.
# Build the data frame used for the per-test comparison.
# `Avg Score` holds the mean score of the row's own test, so each individual
# score can be compared with the average for the same test. The result is
# ungrouped again and sorted from lowest to highest score.
t1df <- srdf %>%
  group_by(`test number`) %>%
  mutate(`Avg Score` = mean(Score)) %>%
  ungroup() %>%
  arrange(Score)

# Overlaid, semi-transparent histograms of Score, one per test, drawn with
# position = "identity" so the bars overlap instead of stacking.
ggplot(t1df, aes(Score)) +
  geom_histogram(
    aes(color = `test number`, fill = `test number`),
    position = "identity", bins = 5, alpha = 0.4
  ) +
  scale_color_manual(values = c("#00AFBB", "#E7B800")) +
  scale_fill_manual(values = c("#00AFBB", "#E7B800"))
# Build the data frame used for the per-term comparison.
# Keep only the columns the plots need (selected by name), then add
# `Avg Score`, the mean score within each term. The grouping is removed
# afterwards so later operations on t2df are not silently done per term.
t2df <- t1df %>%
  select(`test number`, Term, Score) %>%
  group_by(Term) %>%
  mutate(`Avg Score` = mean(Score)) %>%
  ungroup()

# Overlaid, semi-transparent histograms of Score, one per term.
ggplot(t2df, aes(Score)) +
  geom_histogram(
    aes(color = Term, fill = Term),
    position = "identity", bins = 5, alpha = 0.4
  ) +
  scale_color_manual(values = c("#CC0000", "#006600", "#669999")) +
  scale_fill_manual(values = c("#CC0000", "#006600", "#669999"))

# The same distribution as a stacked bar chart (one bar per distinct score),
# which is easier to read than the overlapping histogram fills.
ggplot(t2df, aes(x = Score, fill = Term)) +
  geom_bar() +
  theme_classic()
## Observations
Students tended to score higher on test 2, according to the first graph above. In the second graph I wanted to see how scores were distributed by term. Interestingly, the scores for the third term were more spread out, while terms 1 and 2 showed a similar peak. I also made a third graph showing the same findings as a bar chart, because the overlapping fill in the second histogram is a little hard to read.