# Echo the source of every chunk in the knitted report (set once; repeating
# the identical call has no additional effect).
knitr::opts_chunk$set(echo = TRUE)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(ggplot2)
library(stringr)
# Location of the raw student results CSV.
urlfile <- "https://raw.githubusercontent.com/Nhodgkinson/DATA-607-P2/main/student_results.csv"

# Open a connection to that URL and parse the CSV into a tibble.
srpdata <- urlfile %>%
  url() %>%
  read_csv()
## Rows: 10 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): name, sex and age, test number
## dbl (5): id, phone, term 1, term 2, term 3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Tidying

The data is currently in a wide format and we need to tidy it up. I want to gather the term columns into a single column and create a new column called Score that holds the values currently spread out under each term. Then we need to clean up some of the columns, for example by ensuring they have the correct data types.

# Print the tibble as loaded so its current layout can be inspected.
srpdata
## # A tibble: 10 × 8
##       id name   phone `sex and age` `test number` `term 1` `term 2` `term 3`
##    <dbl> <chr>  <dbl> <chr>         <chr>            <dbl>    <dbl>    <dbl>
##  1     1 Mike     134 m_12          test 1              76       84       87
##  2     2 Linda    270 f_13          test 1              88       90       73
##  3     3 Sam      210 m_11          test 1              78       74       80
##  4     4 Esther   617 f_12          test 1              68       75       74
##  5     5 Mary     114 f_14          test 1              65       67       64
##  6     1 Mike     134 m_12          test 2              85       80       90
##  7     2 Linda    270 f_13          test 2              87       82       94
##  8     3 Sam      210 m_11          test 2              80       87       80
##  9     4 Esther   617 f_12          test 2              70       75       78
## 10     5 Mary     114 f_14          test 2              68       70       63
# Tidy the wide student results into one row per student, test and term.
# Columns are referenced by name rather than position so the pipeline does not
# silently break if the CSV's column layout changes.
srdf <- srpdata %>%
  # Stack the term columns into Term/Score pairs (pivot_longer() supersedes
  # gather()).
  pivot_longer(
    cols = starts_with("term"),
    names_to = "Term",
    values_to = "Score"
  ) %>%
  mutate(
    # id is an identifier, not a quantity, so store it as character.
    id = as.character(id),
    # `sex and age` looks like "m_12": the first non-digit character is the
    # sex, and the first run of one or more digits is the age.
    Sex = str_extract(`sex and age`, "\\D"),
    # Age is a number of years, so store it as an integer rather than as text.
    Age = as.integer(str_extract(`sex and age`, "\\d+"))
  ) %>%
  # Drop the combined `sex and age` column and put the student attributes first.
  select(id, name, Sex, Age, phone, `test number`, Term, Score) %>%
  # Sort by student, then term; rows that tie keep their order from the source
  # file.
  arrange(id, Term)

Graph

I want to see how individual test 1 scores compare to the average test 1 score.

# Build the plotting data: every score together with the mean score of the
# test it belongs to, so individual scores can be compared with their own
# test's average rather than with the average over all tests.
t1df <- srdf %>%
  group_by(`test number`) %>%
  mutate(`Avg Score` = mean(Score)) %>%
  ungroup() %>%
  arrange(Score)

# Overlaid histograms of the scores for each test; the dashed vertical lines
# mark each test's average score.
ggplot(t1df, aes(Score)) +
  geom_histogram(
    aes(color = `test number`, fill = `test number`),
    position = "identity", bins = 5, alpha = 0.4
  ) +
  geom_vline(
    # One row per test so each average is drawn exactly once.
    data = distinct(t1df, `test number`, `Avg Score`),
    aes(xintercept = `Avg Score`, color = `test number`),
    linetype = "dashed"
  ) +
  scale_color_manual(values = c("#00AFBB", "#E7B800")) +
  scale_fill_manual(values = c("#00AFBB", "#E7B800"))

# Build the per-term plotting data: keep only the test, term and score columns
# (selected by name, not position) and attach each term's mean score.
t2df <- t1df %>%
  select(`test number`, Term, Score) %>%
  group_by(Term) %>%
  mutate(`Avg Score` = mean(Score)) %>%
  # Drop the grouping so later operations on t2df are not silently per-term.
  ungroup()

# Overlaid histograms of the scores for each term.
ggplot(t2df, aes(Score)) +
  geom_histogram(
    aes(color = Term, fill = Term),
    position = "identity", bins = 5, alpha = 0.4
  ) +
  scale_color_manual(values = c("#CC0000", "#006600", "#669999")) +
  scale_fill_manual(values = c("#CC0000", "#006600", "#669999"))

# Stacked bar chart: one bar per distinct score, filled by term.
ggplot(data = t2df, mapping = aes(x = Score, fill = Term)) +
  geom_bar() +
  theme_classic()

## Observations

Students tended to score higher on test 2 according to the first graph. In the second graph I wanted to see how scores were distributed by term. Interestingly, the distribution of scores for the third term was spread out, while terms 1 and 2 saw a similar peak. I also made a bar chart showing the same findings because the fill in the second histogram is a little hard to read.