library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(stringr)
library(ggplot2)

Dataset1: This dataset is taken from discussion board (posted by Sangeetha Sashikumar). After collecting the dataset I put it my github account. With this dataset I will analyse the stuents grades based on their genders and ages.

data1<-data.frame(read.csv("https://raw.githubusercontent.com/Raji030/data607_project02/main/data607_project2.csv",sep=",", header = T))
data1

##       Names   Age   Grades Gender
## 1     Sally     6 85.00000      1
## 2   Michael   "7" 89.66667      0
## 3 Elizabeth   "6" 92.00000      1
## 4   Anthony     7 93.50000      0
## 5      Mary 6.111 90.00010      1
## 6    Steven     7 91.00000      0

Transforming Gender column’s numeric value into characters, male and female

data1<-mutate(data1, Gender = case_when(Gender == 1 ~'girl',
                                  Gender == 0 ~'boy'))
data1

##       Names   Age   Grades Gender
## 1     Sally     6 85.00000   girl
## 2   Michael   "7" 89.66667    boy
## 3 Elizabeth   "6" 92.00000   girl
## 4   Anthony     7 93.50000    boy
## 5      Mary 6.111 90.00010   girl
## 6    Steven     7 91.00000    boy

Rounding Grades column numeric values to 2 decimal points

data1<-data1 %>% mutate_at('Grades', round, 2)
data1

##       Names   Age Grades Gender
## 1     Sally     6  85.00   girl
## 2   Michael   "7"  89.67    boy
## 3 Elizabeth   "6"  92.00   girl
## 4   Anthony     7  93.50    boy
## 5      Mary 6.111  90.00   girl
## 6    Steven     7  91.00    boy

Transforming Age clomun character values to numeric values and rounding the coulmn values in 1 decimel point

data1$Age[2]<-7
data1$Age[3]<-6
data1

##       Names   Age Grades Gender
## 1     Sally     6  85.00   girl
## 2   Michael     7  89.67    boy
## 3 Elizabeth     6  92.00   girl
## 4   Anthony     7  93.50    boy
## 5      Mary 6.111  90.00   girl
## 6    Steven     7  91.00    boy

data1$Age<-as.numeric(data1$Age)
data1

##       Names   Age Grades Gender
## 1     Sally 6.000  85.00   girl
## 2   Michael 7.000  89.67    boy
## 3 Elizabeth 6.000  92.00   girl
## 4   Anthony 7.000  93.50    boy
## 5      Mary 6.111  90.00   girl
## 6    Steven 7.000  91.00    boy

data1<-data1 %>% mutate_at('Age', round, 1)
data1

##       Names Age Grades Gender
## 1     Sally 6.0  85.00   girl
## 2   Michael 7.0  89.67    boy
## 3 Elizabeth 6.0  92.00   girl
## 4   Anthony 7.0  93.50    boy
## 5      Mary 6.1  90.00   girl
## 6    Steven 7.0  91.00    boy

Analyzing the dataset to see gender wise highest grades, names and ages

data2<-data1%>%arrange(desc(Grades))
data2

##       Names Age Grades Gender
## 1   Anthony 7.0  93.50    boy
## 2 Elizabeth 6.0  92.00   girl
## 3    Steven 7.0  91.00    boy
## 4      Mary 6.1  90.00   girl
## 5   Michael 7.0  89.67    boy
## 6     Sally 6.0  85.00   girl

data3<-data1%>%summarise(average=mean(Grades))
data3

##   average
## 1  90.195

data4<-data1[order(data1$Gender,-data1$Grades),]
data4

##       Names Age Grades Gender
## 4   Anthony 7.0  93.50    boy
## 6    Steven 7.0  91.00    boy
## 2   Michael 7.0  89.67    boy
## 3 Elizabeth 6.0  92.00   girl
## 5      Mary 6.1  90.00   girl
## 1     Sally 6.0  85.00   girl

data1%>%ggplot(aes(x=" ",y=Grades,fill=Names))+geom_bar(stat="identity",position="dodge")+ xlab("Names") + ylab("Grades")

From the above tables, it is seen that the highest grade 93.50 is received by Anthony who is a 7 years boy. Elizabeth got the second highest grade, 92 and her grade is also the highest among all the girls’ grades and she is 6. Also, Sally got the lowest grade 85 and the average grade is 90.195. The plot is also reflecting the grades by names where it is clearly found that Anthony has got the highest grade, Elizabeth has received the seocnd highest grade and Sally has received the lowest grade.

Dataset2: This dataset is taken from discussion board (posted by Jhalak Das). After collecting the dataset I put it on my github account. This dataset will be analyzed to get average test scores of students.

dataset2<-read.csv("https://raw.githubusercontent.com/Raji030/data_for_project2_607/main/student_results.csv",sep=",",header=T)
dataset2

##    id   name phone sex.and.age test.number term.1 term.2 term.3
## 1   1   Mike   134        m_12      test 1     76     84     87
## 2   2  Linda   270        f_13      test 1     88     90     73
## 3   3    Sam   210        m_11      test 1     78     74     80
## 4   4 Esther   617        f_12      test 1     68     75     74
## 5   5   Mary   114        f_14      test 1     65     67     64
## 6   1   Mike   134        m_12      test 2     85     80     90
## 7   2  Linda   270        f_13      test 2     87     82     94
## 8   3    Sam   210        m_11      test 2     80     87     80
## 9   4 Esther   617        f_12      test 2     70     75     78
## 10  5   Mary   114        f_14      test 2     68     70     63

Separating sex.and.age column into two columns, sex and age

dataset02<-dataset2%>%separate(sex.and.age,into=c("sex","age"))
dataset02

##    id   name phone sex age test.number term.1 term.2 term.3
## 1   1   Mike   134   m  12      test 1     76     84     87
## 2   2  Linda   270   f  13      test 1     88     90     73
## 3   3    Sam   210   m  11      test 1     78     74     80
## 4   4 Esther   617   f  12      test 1     68     75     74
## 5   5   Mary   114   f  14      test 1     65     67     64
## 6   1   Mike   134   m  12      test 2     85     80     90
## 7   2  Linda   270   f  13      test 2     87     82     94
## 8   3    Sam   210   m  11      test 2     80     87     80
## 9   4 Esther   617   f  12      test 2     70     75     78
## 10  5   Mary   114   f  14      test 2     68     70     63

Renaming column by removing dot character

new_col_names <- 
  colnames(dataset02) %>% 
    str_replace_all("\\.", '_')

colnames(dataset02) <- new_col_names
head(dataset02,5)

##   id   name phone sex age test_number term_1 term_2 term_3
## 1  1   Mike   134   m  12      test 1     76     84     87
## 2  2  Linda   270   f  13      test 1     88     90     73
## 3  3    Sam   210   m  11      test 1     78     74     80
## 4  4 Esther   617   f  12      test 1     68     75     74
## 5  5   Mary   114   f  14      test 1     65     67     64

Pivoting three term columns into a single term column

dataset03<-dataset02%>%pivot_longer(c("term_1","term_2","term_3"),names_to="term", values_to ="score" )
dataset03

## # A tibble: 30 × 8
##       id name   phone sex   age   test_number term   score
##    <int> <chr>  <int> <chr> <chr> <chr>       <chr>  <int>
##  1     1 Mike     134 m     12    test 1      term_1    76
##  2     1 Mike     134 m     12    test 1      term_2    84
##  3     1 Mike     134 m     12    test 1      term_3    87
##  4     2 Linda    270 f     13    test 1      term_1    88
##  5     2 Linda    270 f     13    test 1      term_2    90
##  6     2 Linda    270 f     13    test 1      term_3    73
##  7     3 Sam      210 m     11    test 1      term_1    78
##  8     3 Sam      210 m     11    test 1      term_2    74
##  9     3 Sam      210 m     11    test 1      term_3    80
## 10     4 Esther   617 f     12    test 1      term_1    68
## # … with 20 more rows

Removing extra terms from column valuses

dataset03$term <- str_extract(dataset03$term, "\\d")
dataset03$test_number<-str_extract(dataset03$test_number,"\\d" )
dataset03

## # A tibble: 30 × 8
##       id name   phone sex   age   test_number term  score
##    <int> <chr>  <int> <chr> <chr> <chr>       <chr> <int>
##  1     1 Mike     134 m     12    1           1        76
##  2     1 Mike     134 m     12    1           2        84
##  3     1 Mike     134 m     12    1           3        87
##  4     2 Linda    270 f     13    1           1        88
##  5     2 Linda    270 f     13    1           2        90
##  6     2 Linda    270 f     13    1           3        73
##  7     3 Sam      210 m     11    1           1        78
##  8     3 Sam      210 m     11    1           2        74
##  9     3 Sam      210 m     11    1           3        80
## 10     4 Esther   617 f     12    1           1        68
## # … with 20 more rows

Finding students’ average scores in different terms

student_avg_scores <-
  dataset03%>%group_by(name,term)%>%
      summarise(avg_score = mean(score))

## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.

head(student_avg_scores)

## # A tibble: 6 × 3
## # Groups:   name [2]
##   name   term  avg_score
##   <chr>  <chr>     <dbl>
## 1 Esther 1          69  
## 2 Esther 2          75  
## 3 Esther 3          76  
## 4 Linda  1          87.5
## 5 Linda  2          86  
## 6 Linda  3          83.5

student_avg_scores$name[which.max(student_avg_scores$avg_score)]

## [1] "Mike"

student_avg_scores$term[which.max(student_avg_scores$avg_score)]

## [1] "3"

student_avg_scores$name[which.min(student_avg_scores$avg_score)]

## [1] "Mary"

student_avg_scores$term[which.min(student_avg_scores$avg_score)]

## [1] "3"

From the above average score table it is found that in term 3, Mike’s average score is maximum whereas Mary scored minimum in the same term. Students average scores are also depicted in the following plot.

student_avg_scores%>%ggplot(aes(x=" ",y= avg_score,fill=name))+geom_bar(stat="identity",position="dodge")+ xlab("Names") + ylab("avg_scores")

Dataset3: This dataset reflects the students performances in three different exams. Here, I examine students academic performances based on their gender, ethnicity, test preparation, parental level of education and type of lunch they are receiving. After collecting the dataset from kaggle.com, i put it in my github account.

students<-data.frame(read.csv("https://raw.githubusercontent.com/Raji030/project_02data/main/StudentsPerformance.csv", sep=",", header=TRUE, stringsAsFactors=FALSE))

head(students)

##   gender race.ethnicity parental.level.of.education        lunch
## 1 female        group B           bachelor's degree     standard
## 2 female        group C                some college     standard
## 3 female        group B             master's degree     standard
## 4   male        group A          associate's degree free/reduced
## 5   male        group C                some college     standard
## 6 female        group B          associate's degree     standard
##   test.preparation.course math.score reading.score writing.score
## 1                    none         72            72            74
## 2               completed         69            90            88
## 3                    none         90            95            93
## 4                    none         47            57            44
## 5                    none         76            78            75
## 6                    none         71            83            78

Renaming few columns

names(students)[2] <- "race_ethnicity"
names(students)[3] <- "parental_edu"
names(students)[5]<- "test_prep"
names(students)[6] <- "math_score"
names(students)[7] <- "reading_score"
names(students)[8] <- "writing_score"

head(students)

##   gender race_ethnicity       parental_edu        lunch test_prep math_score
## 1 female        group B  bachelor's degree     standard      none         72
## 2 female        group C       some college     standard completed         69
## 3 female        group B    master's degree     standard      none         90
## 4   male        group A associate's degree free/reduced      none         47
## 5   male        group C       some college     standard      none         76
## 6 female        group B associate's degree     standard      none         71
##   reading_score writing_score
## 1            72            74
## 2            90            88
## 3            95            93
## 4            57            44
## 5            78            75
## 6            83            78

Sorting out students’ math,writing and reading scores

students_math<-students[order(students$math_score, decreasing = TRUE),]
head(students_math)

##     gender race_ethnicity       parental_edu        lunch test_prep math_score
## 150   male        group E associate's degree free/reduced completed        100
## 452 female        group E       some college     standard      none        100
## 459 female        group E  bachelor's degree     standard      none        100
## 624   male        group A       some college     standard completed        100
## 626   male        group D       some college     standard completed        100
## 917   male        group E  bachelor's degree     standard completed        100
##     reading_score writing_score
## 150           100            93
## 452            92            97
## 459           100           100
## 624            96            86
## 626            97            99
## 917           100           100

By sorting out math scores, it is found that group E students have possessed the top three positions where two of them are female. It is also found that student in 6th position has received 100 percent in all three exams.

students_reading<-students[order(students$reading_score, decreasing = TRUE),]
head(students_reading)

##     gender race_ethnicity       parental_edu        lunch test_prep math_score
## 107 female        group D    master's degree     standard      none         87
## 115 female        group E  bachelor's degree     standard completed         99
## 150   male        group E associate's degree free/reduced completed        100
## 166 female        group C  bachelor's degree     standard completed         96
## 180 female        group D   some high school     standard completed         97
## 382   male        group C associate's degree     standard completed         87
##     reading_score writing_score
## 107           100           100
## 115           100           100
## 150           100            93
## 166           100           100
## 180           100           100
## 382           100            95

By sorting out reading scores, it is seen that top two positions are held by female students from different ethnicity groups. It is also seen that among the 6 listed data, four positions are held by female students. Moreover, student at the top most position did not take any test preparation and her parental education level is masters.

students_writing<-students[order(students$writing_score, decreasing = TRUE),]
head(students_writing)

##     gender race_ethnicity      parental_edu        lunch test_prep math_score
## 107 female        group D   master's degree     standard      none         87
## 115 female        group E bachelor's degree     standard completed         99
## 166 female        group C bachelor's degree     standard completed         96
## 180 female        group D  some high school     standard completed         97
## 378 female        group D   master's degree free/reduced completed         85
## 404 female        group D       high school     standard completed         88
##     reading_score writing_score
## 107           100           100
## 115           100           100
## 166           100           100
## 180           100           100
## 378            95           100
## 404            99           100

By sorting out writing scores, it is seen that the top six spots are held by female students.Like reading section, the student at top most position in writing did not have test preparation and her parents have master level of education too.From the three academic performances, it is found that the students from ethnicity groups D,E,and C are doing much better than other ethnicity groups.

Creating average score column and attaching it to the data frame

students <- students %>%
    mutate(avg_score= (math_score+ reading_score + writing_score)/3)

head(students)

##   gender race_ethnicity       parental_edu        lunch test_prep math_score
## 1 female        group B  bachelor's degree     standard      none         72
## 2 female        group C       some college     standard completed         69
## 3 female        group B    master's degree     standard      none         90
## 4   male        group A associate's degree free/reduced      none         47
## 5   male        group C       some college     standard      none         76
## 6 female        group B associate's degree     standard      none         71
##   reading_score writing_score avg_score
## 1            72            74  72.66667
## 2            90            88  82.33333
## 3            95            93  92.66667
## 4            57            44  49.33333
## 5            78            75  76.33333
## 6            83            78  77.33333

subsetting dataframe

students_avg<-select(students, gender, lunch, parental_edu, avg_score)
head(students_avg)

##   gender        lunch       parental_edu avg_score
## 1 female     standard  bachelor's degree  72.66667
## 2 female     standard       some college  82.33333
## 3 female     standard    master's degree  92.66667
## 4   male free/reduced associate's degree  49.33333
## 5   male     standard       some college  76.33333
## 6 female     standard associate's degree  77.33333

Ordering avg score

students_avg<-students_avg[order(students_avg$avg_score, decreasing = TRUE),]
head(students_avg)

##     gender    lunch       parental_edu avg_score
## 459 female standard  bachelor's degree 100.00000
## 917   male standard  bachelor's degree 100.00000
## 963 female standard associate's degree 100.00000
## 115 female standard  bachelor's degree  99.66667
## 180 female standard   some high school  99.00000
## 713 female standard       some college  99.00000

Polotting average sccore against parental education level

ggplot(students_avg, aes(x = "", y = avg_score, fill = parental_edu)) +  geom_bar(stat="identity",position="dodge") + 
          xlab("parental_edducation") + ylab("average_score")

ggplot(students_avg, aes(x = "", y = avg_score, fill =lunch)) +  geom_bar(stat="identity",position="dodge") + 
          xlab("lunch_type") + ylab("average_score")

By analyzing the above tables and plots, it is seen that students average academic performance vary about their parental level of education and type of lunch they are receiving. Students average scores are higher when their parents have associate and bachelor level of education. On the other hand, students who are receiving standard lunch are doing higher academic performances compared to those receiving free/reduced lunch.

project_02_data_607

Mahmud Hasan Al Raji

2022-10-09