library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(ggplot2)
data1<-data.frame(read.csv("https://raw.githubusercontent.com/Raji030/data607_project02/main/data607_project2.csv",sep=",", header = T))
data1
## Names Age Grades Gender
## 1 Sally 6 85.00000 1
## 2 Michael "7" 89.66667 0
## 3 Elizabeth "6" 92.00000 1
## 4 Anthony 7 93.50000 0
## 5 Mary 6.111 90.00010 1
## 6 Steven 7 91.00000 0
data1<-mutate(data1, Gender = case_when(Gender == 1 ~'girl',
Gender == 0 ~'boy'))
data1
## Names Age Grades Gender
## 1 Sally 6 85.00000 girl
## 2 Michael "7" 89.66667 boy
## 3 Elizabeth "6" 92.00000 girl
## 4 Anthony 7 93.50000 boy
## 5 Mary 6.111 90.00010 girl
## 6 Steven 7 91.00000 boy
data1<-data1 %>% mutate_at('Grades', round, 2)
data1
## Names Age Grades Gender
## 1 Sally 6 85.00 girl
## 2 Michael "7" 89.67 boy
## 3 Elizabeth "6" 92.00 girl
## 4 Anthony 7 93.50 boy
## 5 Mary 6.111 90.00 girl
## 6 Steven 7 91.00 boy
data1$Age[2]<-7
data1$Age[3]<-6
data1
## Names Age Grades Gender
## 1 Sally 6 85.00 girl
## 2 Michael 7 89.67 boy
## 3 Elizabeth 6 92.00 girl
## 4 Anthony 7 93.50 boy
## 5 Mary 6.111 90.00 girl
## 6 Steven 7 91.00 boy
data1$Age<-as.numeric(data1$Age)
data1
## Names Age Grades Gender
## 1 Sally 6.000 85.00 girl
## 2 Michael 7.000 89.67 boy
## 3 Elizabeth 6.000 92.00 girl
## 4 Anthony 7.000 93.50 boy
## 5 Mary 6.111 90.00 girl
## 6 Steven 7.000 91.00 boy
data1<-data1 %>% mutate_at('Age', round, 1)
data1
## Names Age Grades Gender
## 1 Sally 6.0 85.00 girl
## 2 Michael 7.0 89.67 boy
## 3 Elizabeth 6.0 92.00 girl
## 4 Anthony 7.0 93.50 boy
## 5 Mary 6.1 90.00 girl
## 6 Steven 7.0 91.00 boy
data2<-data1%>%arrange(desc(Grades))
data2
## Names Age Grades Gender
## 1 Anthony 7.0 93.50 boy
## 2 Elizabeth 6.0 92.00 girl
## 3 Steven 7.0 91.00 boy
## 4 Mary 6.1 90.00 girl
## 5 Michael 7.0 89.67 boy
## 6 Sally 6.0 85.00 girl
data3<-data1%>%summarise(average=mean(Grades))
data3
## average
## 1 90.195
data4<-data1[order(data1$Gender,-data1$Grades),]
data4
## Names Age Grades Gender
## 4 Anthony 7.0 93.50 boy
## 6 Steven 7.0 91.00 boy
## 2 Michael 7.0 89.67 boy
## 3 Elizabeth 6.0 92.00 girl
## 5 Mary 6.1 90.00 girl
## 1 Sally 6.0 85.00 girl
data1%>%ggplot(aes(x=" ",y=Grades,fill=Names))+geom_bar(stat="identity",position="dodge")+ xlab("Names") + ylab("Grades")
From the above tables, it is seen that the highest grade 93.50 is received by Anthony who is a 7 years boy. Elizabeth got the second highest grade, 92 and her grade is also the highest among all the girls’ grades and she is 6. Also, Sally got the lowest grade 85 and the average grade is 90.195. The plot is also reflecting the grades by names where it is clearly found that Anthony has got the highest grade, Elizabeth has received the seocnd highest grade and Sally has received the lowest grade.
dataset2<-read.csv("https://raw.githubusercontent.com/Raji030/data_for_project2_607/main/student_results.csv",sep=",",header=T)
dataset2
## id name phone sex.and.age test.number term.1 term.2 term.3
## 1 1 Mike 134 m_12 test 1 76 84 87
## 2 2 Linda 270 f_13 test 1 88 90 73
## 3 3 Sam 210 m_11 test 1 78 74 80
## 4 4 Esther 617 f_12 test 1 68 75 74
## 5 5 Mary 114 f_14 test 1 65 67 64
## 6 1 Mike 134 m_12 test 2 85 80 90
## 7 2 Linda 270 f_13 test 2 87 82 94
## 8 3 Sam 210 m_11 test 2 80 87 80
## 9 4 Esther 617 f_12 test 2 70 75 78
## 10 5 Mary 114 f_14 test 2 68 70 63
dataset02<-dataset2%>%separate(sex.and.age,into=c("sex","age"))
dataset02
## id name phone sex age test.number term.1 term.2 term.3
## 1 1 Mike 134 m 12 test 1 76 84 87
## 2 2 Linda 270 f 13 test 1 88 90 73
## 3 3 Sam 210 m 11 test 1 78 74 80
## 4 4 Esther 617 f 12 test 1 68 75 74
## 5 5 Mary 114 f 14 test 1 65 67 64
## 6 1 Mike 134 m 12 test 2 85 80 90
## 7 2 Linda 270 f 13 test 2 87 82 94
## 8 3 Sam 210 m 11 test 2 80 87 80
## 9 4 Esther 617 f 12 test 2 70 75 78
## 10 5 Mary 114 f 14 test 2 68 70 63
new_col_names <-
colnames(dataset02) %>%
str_replace_all("\\.", '_')
colnames(dataset02) <- new_col_names
head(dataset02,5)
## id name phone sex age test_number term_1 term_2 term_3
## 1 1 Mike 134 m 12 test 1 76 84 87
## 2 2 Linda 270 f 13 test 1 88 90 73
## 3 3 Sam 210 m 11 test 1 78 74 80
## 4 4 Esther 617 f 12 test 1 68 75 74
## 5 5 Mary 114 f 14 test 1 65 67 64
dataset03<-dataset02%>%pivot_longer(c("term_1","term_2","term_3"),names_to="term", values_to ="score" )
dataset03
## # A tibble: 30 × 8
## id name phone sex age test_number term score
## <int> <chr> <int> <chr> <chr> <chr> <chr> <int>
## 1 1 Mike 134 m 12 test 1 term_1 76
## 2 1 Mike 134 m 12 test 1 term_2 84
## 3 1 Mike 134 m 12 test 1 term_3 87
## 4 2 Linda 270 f 13 test 1 term_1 88
## 5 2 Linda 270 f 13 test 1 term_2 90
## 6 2 Linda 270 f 13 test 1 term_3 73
## 7 3 Sam 210 m 11 test 1 term_1 78
## 8 3 Sam 210 m 11 test 1 term_2 74
## 9 3 Sam 210 m 11 test 1 term_3 80
## 10 4 Esther 617 f 12 test 1 term_1 68
## # … with 20 more rows
dataset03$term <- str_extract(dataset03$term, "\\d")
dataset03$test_number<-str_extract(dataset03$test_number,"\\d" )
dataset03
## # A tibble: 30 × 8
## id name phone sex age test_number term score
## <int> <chr> <int> <chr> <chr> <chr> <chr> <int>
## 1 1 Mike 134 m 12 1 1 76
## 2 1 Mike 134 m 12 1 2 84
## 3 1 Mike 134 m 12 1 3 87
## 4 2 Linda 270 f 13 1 1 88
## 5 2 Linda 270 f 13 1 2 90
## 6 2 Linda 270 f 13 1 3 73
## 7 3 Sam 210 m 11 1 1 78
## 8 3 Sam 210 m 11 1 2 74
## 9 3 Sam 210 m 11 1 3 80
## 10 4 Esther 617 f 12 1 1 68
## # … with 20 more rows
student_avg_scores <-
dataset03%>%group_by(name,term)%>%
summarise(avg_score = mean(score))
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.
head(student_avg_scores)
## # A tibble: 6 × 3
## # Groups: name [2]
## name term avg_score
## <chr> <chr> <dbl>
## 1 Esther 1 69
## 2 Esther 2 75
## 3 Esther 3 76
## 4 Linda 1 87.5
## 5 Linda 2 86
## 6 Linda 3 83.5
student_avg_scores$name[which.max(student_avg_scores$avg_score)]
## [1] "Mike"
student_avg_scores$term[which.max(student_avg_scores$avg_score)]
## [1] "3"
student_avg_scores$name[which.min(student_avg_scores$avg_score)]
## [1] "Mary"
student_avg_scores$term[which.min(student_avg_scores$avg_score)]
## [1] "3"
From the above average score table it is found that in term 3, Mike’s average score is maximum whereas Mary scored minimum in the same term. Students average scores are also depicted in the following plot.
student_avg_scores%>%ggplot(aes(x=" ",y= avg_score,fill=name))+geom_bar(stat="identity",position="dodge")+ xlab("Names") + ylab("avg_scores")
students<-data.frame(read.csv("https://raw.githubusercontent.com/Raji030/project_02data/main/StudentsPerformance.csv", sep=",", header=TRUE, stringsAsFactors=FALSE))
head(students)
## gender race.ethnicity parental.level.of.education lunch
## 1 female group B bachelor's degree standard
## 2 female group C some college standard
## 3 female group B master's degree standard
## 4 male group A associate's degree free/reduced
## 5 male group C some college standard
## 6 female group B associate's degree standard
## test.preparation.course math.score reading.score writing.score
## 1 none 72 72 74
## 2 completed 69 90 88
## 3 none 90 95 93
## 4 none 47 57 44
## 5 none 76 78 75
## 6 none 71 83 78
names(students)[2] <- "race_ethnicity"
names(students)[3] <- "parental_edu"
names(students)[5]<- "test_prep"
names(students)[6] <- "math_score"
names(students)[7] <- "reading_score"
names(students)[8] <- "writing_score"
head(students)
## gender race_ethnicity parental_edu lunch test_prep math_score
## 1 female group B bachelor's degree standard none 72
## 2 female group C some college standard completed 69
## 3 female group B master's degree standard none 90
## 4 male group A associate's degree free/reduced none 47
## 5 male group C some college standard none 76
## 6 female group B associate's degree standard none 71
## reading_score writing_score
## 1 72 74
## 2 90 88
## 3 95 93
## 4 57 44
## 5 78 75
## 6 83 78
students_math<-students[order(students$math_score, decreasing = TRUE),]
head(students_math)
## gender race_ethnicity parental_edu lunch test_prep math_score
## 150 male group E associate's degree free/reduced completed 100
## 452 female group E some college standard none 100
## 459 female group E bachelor's degree standard none 100
## 624 male group A some college standard completed 100
## 626 male group D some college standard completed 100
## 917 male group E bachelor's degree standard completed 100
## reading_score writing_score
## 150 100 93
## 452 92 97
## 459 100 100
## 624 96 86
## 626 97 99
## 917 100 100
By sorting out math scores, it is found that group E students have possessed the top three positions where two of them are female. It is also found that student in 6th position has received 100 percent in all three exams.
students_reading<-students[order(students$reading_score, decreasing = TRUE),]
head(students_reading)
## gender race_ethnicity parental_edu lunch test_prep math_score
## 107 female group D master's degree standard none 87
## 115 female group E bachelor's degree standard completed 99
## 150 male group E associate's degree free/reduced completed 100
## 166 female group C bachelor's degree standard completed 96
## 180 female group D some high school standard completed 97
## 382 male group C associate's degree standard completed 87
## reading_score writing_score
## 107 100 100
## 115 100 100
## 150 100 93
## 166 100 100
## 180 100 100
## 382 100 95
By sorting out reading scores, it is seen that top two positions are held by female students from different ethnicity groups. It is also seen that among the 6 listed data, four positions are held by female students. Moreover, student at the top most position did not take any test preparation and her parental education level is masters.
students_writing<-students[order(students$writing_score, decreasing = TRUE),]
head(students_writing)
## gender race_ethnicity parental_edu lunch test_prep math_score
## 107 female group D master's degree standard none 87
## 115 female group E bachelor's degree standard completed 99
## 166 female group C bachelor's degree standard completed 96
## 180 female group D some high school standard completed 97
## 378 female group D master's degree free/reduced completed 85
## 404 female group D high school standard completed 88
## reading_score writing_score
## 107 100 100
## 115 100 100
## 166 100 100
## 180 100 100
## 378 95 100
## 404 99 100
By sorting out writing scores, it is seen that the top six spots are held by female students.Like reading section, the student at top most position in writing did not have test preparation and her parents have master level of education too.From the three academic performances, it is found that the students from ethnicity groups D,E,and C are doing much better than other ethnicity groups.
students <- students %>%
mutate(avg_score= (math_score+ reading_score + writing_score)/3)
head(students)
## gender race_ethnicity parental_edu lunch test_prep math_score
## 1 female group B bachelor's degree standard none 72
## 2 female group C some college standard completed 69
## 3 female group B master's degree standard none 90
## 4 male group A associate's degree free/reduced none 47
## 5 male group C some college standard none 76
## 6 female group B associate's degree standard none 71
## reading_score writing_score avg_score
## 1 72 74 72.66667
## 2 90 88 82.33333
## 3 95 93 92.66667
## 4 57 44 49.33333
## 5 78 75 76.33333
## 6 83 78 77.33333
students_avg<-select(students, gender, lunch, parental_edu, avg_score)
head(students_avg)
## gender lunch parental_edu avg_score
## 1 female standard bachelor's degree 72.66667
## 2 female standard some college 82.33333
## 3 female standard master's degree 92.66667
## 4 male free/reduced associate's degree 49.33333
## 5 male standard some college 76.33333
## 6 female standard associate's degree 77.33333
students_avg<-students_avg[order(students_avg$avg_score, decreasing = TRUE),]
head(students_avg)
## gender lunch parental_edu avg_score
## 459 female standard bachelor's degree 100.00000
## 917 male standard bachelor's degree 100.00000
## 963 female standard associate's degree 100.00000
## 115 female standard bachelor's degree 99.66667
## 180 female standard some high school 99.00000
## 713 female standard some college 99.00000
ggplot(students_avg, aes(x = "", y = avg_score, fill = parental_edu)) + geom_bar(stat="identity",position="dodge") +
xlab("parental_edducation") + ylab("average_score")
ggplot(students_avg, aes(x = "", y = avg_score, fill =lunch)) + geom_bar(stat="identity",position="dodge") +
xlab("lunch_type") + ylab("average_score")
By analyzing the above tables and plots, it is seen that students
average academic performance vary about their parental level of
education and type of lunch they are receiving. Students average scores
are higher when their parents have associate and bachelor level of
education. On the other hand, students who are receiving standard lunch
are doing higher academic performances compared to those receiving
free/reduced lunch.