# plyr must be attached before dplyr; otherwise plyr masks dplyr verbs such as
# rename(), summarise() and mutate().
library(plyr)
library(dplyr)
library(ggplot2)
library(lattice)
# Read the student math dataset from the local CSV file and take a first look.
file.name <- "/Users/Audiorunner13/CUNY MSDS Course Work/R Programming Bridge/Data/student_mat.csv"
# read.csv() already defaults to header = TRUE and sep = ",".
student_math <- read.csv(file.name)
head(student_math, n = 10)
summary(object = student_math)
student school sex age address famsize
Min. : 1.0 Length:395 Length:395 Min. :15.0 Length:395 Length:395
1st Qu.: 99.5 Class :character Class :character 1st Qu.:16.0 Class :character Class :character
Median :198.0 Mode :character Mode :character Median :17.0 Mode :character Mode :character
Mean :198.0 Mean :16.7
3rd Qu.:296.5 3rd Qu.:18.0
Max. :395.0 Max. :22.0
Pstatus Medu Fedu Mjob Fjob reason
Length:395 Min. :0.000 Min. :0.000 Length:395 Length:395 Length:395
Class :character 1st Qu.:2.000 1st Qu.:2.000 Class :character Class :character Class :character
Mode :character Median :3.000 Median :2.000 Mode :character Mode :character Mode :character
Mean :2.749 Mean :2.522
3rd Qu.:4.000 3rd Qu.:3.000
Max. :4.000 Max. :4.000
guardian traveltime studytime failures schoolsup famsup
Length:395 Min. :1.000 Min. :1.000 Min. :0.0000 Length:395 Length:395
Class :character 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000 Class :character Class :character
Mode :character Median :1.000 Median :2.000 Median :0.0000 Mode :character Mode :character
Mean :1.448 Mean :2.035 Mean :0.3342
3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:0.0000
Max. :4.000 Max. :4.000 Max. :3.0000
paid activities nursery higher internet
Length:395 Length:395 Length:395 Length:395 Length:395
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
romantic famrel freetime goout Dalc Walc
Length:395 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
Class :character 1st Qu.:4.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000
Mode :character Median :4.000 Median :3.000 Median :3.000 Median :1.000 Median :2.000
Mean :3.944 Mean :3.235 Mean :3.109 Mean :1.481 Mean :2.291
3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:3.000
Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
health absences G1 G2 G3
Min. :1.000 Min. : 0.000 Min. : 3.00 Min. : 0.00 Min. : 0.00
1st Qu.:3.000 1st Qu.: 0.000 1st Qu.: 8.00 1st Qu.: 9.00 1st Qu.: 8.00
Median :4.000 Median : 4.000 Median :11.00 Median :11.00 Median :11.00
Mean :3.554 Mean : 5.709 Mean :10.91 Mean :10.71 Mean :10.42
3rd Qu.:5.000 3rd Qu.: 8.000 3rd Qu.:13.00 3rd Qu.:13.00 3rd Qu.:14.00
Max. :5.000 Max. :75.000 Max. :19.00 Max. :19.00 Max. :20.00
# Mean and median of age, study time and final grade (G3) over all students.
sprintf(
  "The mean age of math students in the dataset: %s",
  mean(student_math[["age"]])
)
[1] "The mean age of math students in the dataset: 16.6962025316456"
sprintf(
  "The median age of math students in the dataset: %s",
  median(student_math[["age"]])
)
[1] "The median age of math students in the dataset: 17"
sprintf(
  "The mean study time of math students in the dataset: %s",
  mean(student_math[["studytime"]])
)
[1] "The mean study time of math students in the dataset: 2.03544303797468"
sprintf(
  "The median study time of math students in the dataset: %s",
  median(student_math[["studytime"]])
)
[1] "The median study time of math students in the dataset: 2"
sprintf(
  "The mean final grade of math students in the dataset: %s",
  mean(student_math[["G3"]])
)
[1] "The mean final grade of math students in the dataset: 10.4151898734177"
sprintf(
  "The median final grade of math students in the dataset: %s",
  median(student_math[["G3"]])
)
[1] "The median final grade of math students in the dataset: 11"
# subset data.frame, reduce the number of fields
# Columns kept for the per-gender analysis. Names are the new descriptive
# column names; values are the original column names in student_math.
gp_columns <- c(
  Student = "student",
  School = "school",
  Gender = "sex",
  Age = "age",
  ParentLivingStatus = "Pstatus",
  MotherEducation = "Medu",
  FatherEducation = "Fedu",
  MotherJob = "Mjob",
  FatherJob = "Fjob",
  StudyTime = "studytime",
  Internet = "internet",
  FinalGrade = "G3"
)

# The dplyr:: prefix is spelled out because plyr also exports rename() with a
# different signature (x, replace); whichever package is attached last would
# otherwise decide which rename() a bare call reaches.

# Male students at school "GP": keep the analysis columns, then rename them.
student_math.GP_M <- student_math %>%
  dplyr::filter(school == "GP", sex == "M") %>%
  dplyr::select(dplyr::all_of(unname(gp_columns))) %>%
  dplyr::rename(dplyr::all_of(gp_columns))

# Female students at school "GP": same columns and names as the male subset.
student_math.GP_F <- student_math %>%
  dplyr::filter(school == "GP", sex == "F") %>%
  dplyr::select(dplyr::all_of(unname(gp_columns))) %>%
  dplyr::rename(dplyr::all_of(gp_columns))
# Replace coded values with readable labels.
#
# values: vector of codes (any atomic type; compared by character form).
# labels: named character vector, name = code, value = label.
# Returns a character vector the same length as `values`. Codes with no entry
# in `labels` are kept in their character form and NA stays NA.
relabel_codes <- function(values, labels) {
  relabelled <- as.character(values)
  known <- relabelled %in% names(labels)
  relabelled[known] <- unname(labels[relabelled[known]])
  relabelled
}

# Code -> label lookups shared by the male and female subsets.
gender_labels <- c("F" = "Female", "M" = "Male")
living_status_labels <- c("A" = "Apart", "T" = "Together")
education_labels <- c(
  "0" = "None",
  "1" = "Primary",
  "2" = "Grades 5 - 9",
  "3" = "Secondary",
  "4" = "College"
)

# Male subset.
student_math.GP_M$Gender <-
  relabel_codes(student_math.GP_M$Gender, gender_labels)
student_math.GP_M$ParentLivingStatus <-
  relabel_codes(student_math.GP_M$ParentLivingStatus, living_status_labels)
student_math.GP_M$MotherEducation <-
  relabel_codes(student_math.GP_M$MotherEducation, education_labels)
student_math.GP_M$FatherEducation <-
  relabel_codes(student_math.GP_M$FatherEducation, education_labels)
head(student_math.GP_M, 10)

# Female subset.
student_math.GP_F$Gender <-
  relabel_codes(student_math.GP_F$Gender, gender_labels)
student_math.GP_F$ParentLivingStatus <-
  relabel_codes(student_math.GP_F$ParentLivingStatus, living_status_labels)
student_math.GP_F$MotherEducation <-
  relabel_codes(student_math.GP_F$MotherEducation, education_labels)
student_math.GP_F$FatherEducation <-
  relabel_codes(student_math.GP_F$FatherEducation, education_labels)
head(student_math.GP_F, 10)
# Per-column summary of the male subset.
summary(object = student_math.GP_M)
Student School Gender Age ParentLivingStatus MotherEducation
Min. : 6.00 Length:166 Length:166 Min. :15.00 Length:166 Length:166
1st Qu.: 85.25 Class :character Class :character 1st Qu.:15.00 Class :character Class :character
Median :163.50 Mode :character Mode :character Median :16.00 Mode :character Mode :character
Mean :165.64 Mean :16.46
3rd Qu.:247.75 3rd Qu.:17.00
Max. :348.00 Max. :22.00
FatherEducation MotherJob FatherJob StudyTime Internet FinalGrade
Length:166 Length:166 Length:166 Min. :1.000 Length:166 Min. : 0.00
Class :character Class :character Class :character 1st Qu.:1.000 Class :character 1st Qu.: 9.00
Mode :character Mode :character Mode :character Median :2.000 Mode :character Median :11.00
Mean :1.801 Mean :11.06
3rd Qu.:2.000 3rd Qu.:14.00
Max. :4.000 Max. :20.00
# Per-column summary of the female subset.
summary(object = student_math.GP_F)
Student School Gender Age ParentLivingStatus MotherEducation
Min. : 1.0 Length:183 Length:183 Min. :15.00 Length:183 Length:183
1st Qu.: 89.5 Class :character Class :character 1st Qu.:16.00 Class :character Class :character
Median :192.0 Mode :character Mode :character Median :16.00 Mode :character Mode :character
Mean :183.5 Mean :16.58
3rd Qu.:281.0 3rd Qu.:17.00
Max. :349.0 Max. :19.00
FatherEducation MotherJob FatherJob StudyTime Internet FinalGrade
Length:183 Length:183 Length:183 Min. :1.000 Length:183 Min. : 0.000
Class :character Class :character Class :character 1st Qu.:2.000 Class :character 1st Qu.: 8.000
Mode :character Mode :character Mode :character Median :2.000 Mode :character Median :11.000
Mean :2.301 Mean : 9.973
3rd Qu.:3.000 3rd Qu.:13.000
Max. :4.000 Max. :19.000
# Section banner: sprintf() with no placeholders simply echoes the string.
sprintf("The mean and median for age, study time, and Final Grade for Math students at Gabriel Pereira (GP)")
[1] "The mean and median for age, study time, and Final Grade for Math students at Gabriel Pereira (GP)"
# Mean and median age for the male and female subsets.
sprintf("The mean age of GP male math students: %s", mean(student_math.GP_M$Age))
[1] "The mean age of GP male math students: 16.4578313253012"
sprintf("The mean age of GP female math students: %s", mean(student_math.GP_F$Age))
[1] "The mean age of GP female math students: 16.5792349726776"
sprintf("The median age of GP male math students: %s", median(student_math.GP_M$Age))
[1] "The median age of GP male math students: 16"
sprintf("The median age of GP female math students: %s", median(student_math.GP_F$Age))
[1] "The median age of GP female math students: 16"
It is not surprising that the mean and median ages of the male and female students are nearly the same, since they are attending a school with students their own age.
# Mean and median study time for the male and female subsets.
sprintf(
  "The mean study time of GP male math students: %s",
  mean(student_math.GP_M[["StudyTime"]])
)
[1] "The mean study time of GP male math students: 1.80120481927711"
sprintf(
  "The mean study time of GP female math students: %s",
  mean(student_math.GP_F[["StudyTime"]])
)
[1] "The mean study time of GP female math students: 2.30054644808743"
sprintf(
  "The median study time of GP male math students: %s",
  median(student_math.GP_M[["StudyTime"]])
)
[1] "The median study time of GP male math students: 2"
sprintf(
  "The median study time of GP female math students: %s",
  median(student_math.GP_F[["StudyTime"]])
)
[1] "The median study time of GP female math students: 2"
While the median study time is equal for male and female students at 2 hours, the mean study time shows that females tend to study about 30 minutes more on average.
# Mean and median final grade for the male and female subsets.
sprintf(
  "The mean final grade of GP male math students: %s",
  mean(student_math.GP_M[["FinalGrade"]])
)
[1] "The mean final grade of GP male math students: 11.0602409638554"
sprintf(
  "The mean final grade of GP female math students: %s",
  mean(student_math.GP_F[["FinalGrade"]])
)
[1] "The mean final grade of GP female math students: 9.97267759562842"
sprintf(
  "The median final grade of GP male math students: %s",
  median(student_math.GP_M[["FinalGrade"]])
)
[1] "The median final grade of GP male math students: 11"
sprintf(
  "The median final grade of GP female math students: %s",
  median(student_math.GP_F[["FinalGrade"]])
)
[1] "The median final grade of GP female math students: 11"
Here again, the median final grade for male and female students is equal at 11. However, the mean (average) final grade for males is 1.09 points higher than that of females even though it appears that females study a little more.
# Scatterplot of final math grade against student number, male subset.
ggplot(data = student_math.GP_M, mapping = aes(x = Student, y = FinalGrade)) +
  geom_point(color = "#0099f9") +
  labs(x = "Male Math Students", y = "Final Grade")
At first glance it appears that both scatterplots show almost identical results. However, if you look closely you will notice that in the female scatterplot below, more grades fall between the 5 and 10 lines than in the male scatterplot, and the females also have more points on the zero line. This certainly explains why the female mean final grade is lower than the male mean final grade.
# Scatterplot of final math grade against student number, female subset.
ggplot(data = student_math.GP_F, mapping = aes(x = Student, y = FinalGrade)) +
  geom_point(color = "#0099f9") +
  labs(x = "Female Math Students", y = "Final Grade")
# Final grade by study-time level for the male subset; box widths scale with
# group size (varwidth = TRUE).
boxplot(
  FinalGrade ~ StudyTime,
  data = student_math.GP_M,
  main = "Male Student Math Grade Data",
  xlab = "Study Time (in hours)",
  ylab = "Final Grade",
  col = c("blue", "green", "yellow", "purple"),
  varwidth = TRUE,
  notch = FALSE
)
The boxplots show more clearly the median, mean and any outliers in the female and male final grade data, broken down by the amount of study time, by getting a little into the weeds. In the first boxplot, where study time is one hour, here are some points: 1. The median for both is 11. 2. More males than females scored with one hour of study time. 3. The lowest score is 0 for males and 6 for females. In study time hour two you see: 1. An outlier at 0. 2. A median of 11. 3. For those that spent 2 hours studying, more female than male final grades are represented by the box.
# Final grade by study-time level for the female subset; box widths scale with
# group size (varwidth = TRUE).
boxplot(
  FinalGrade ~ StudyTime,
  data = student_math.GP_F,
  main = "Female Student Math Grade Data",
  xlab = "Study Time (in hours)",
  ylab = "Final Grade",
  col = c("blue", "green", "yellow", "purple"),
  varwidth = TRUE,
  notch = FALSE
)
# Histogram of male final grades on the density scale (freq = FALSE), with a
# kernel density curve overlaid.
hist(
  student_math.GP_M$FinalGrade,
  xlab = "Math Final Grade 0 to 20 - Male",
  main = "Male Final Grade",
  freq = FALSE
)
# Estimate the curve from the male grades here. A bare `df` is either the
# F-distribution density function from stats or an object left over from
# another part of the session, so it must not be passed to lines() unassigned.
male_grade_density <- density(student_math.GP_M$FinalGrade)
lines(male_grade_density, lwd = 1, col = "blue")
In these histograms with density curves, one can see that more females than males scored between 0 and 1 for a final grade. However, it appears that the total percentage of female and male final grade scores that lie between 7 and 16 is just about equal. You can see this in the two scatterplots above as well. The density curve supports this too, with the highest density at Final Grade 11, which is equal to the median for both datasets.
# Density-scaled histogram of female final grades with a kernel density curve.
hist(
  student_math.GP_F$FinalGrade,
  main = "Female Final Grade",
  xlab = "Math Final Grade 0 to 20 - Female",
  probability = TRUE
)
df <- density(x = student_math.GP_F$FinalGrade)
lines(df, col = "red", lwd = 1)
# Kernel density plot of male final grades. The formula names the column and
# `data` supplies the frame it is looked up in, so the two always agree.
densityplot(~ FinalGrade, data = student_math.GP_M, xlab = "Math Final Grade - Male")
The density plot is my favorite because it immediately shows just how close to identical the performance of male and female students in this math class is. If it were not for the labels, one could not tell which is the female graph and which is the male. These density plots show exactly what is explained in the histograms with density curves above.
# Kernel density plot of female final grades. The formula names the column and
# `data` supplies the frame it is looked up in, so the two always agree.
densityplot(~ FinalGrade, data = student_math.GP_F, xlab = "Math Final Grade - Female")
BONUS - place the original .csv file in a GitHub repository and have R read it from the link. This will be a very useful skill as you progress in your data science education and career.
# Download the raw CSV text from GitHub, then parse it into a data frame.
library(RCurl)
# NOTE: despite its name, `filename` holds the downloaded file contents.
filename <- getURL(
  url = "https://raw.githubusercontent.com/audiorunner13/Masters-Coursework/main/student_mat.csv"
)
student_mat_bonus <- read.csv(text = filename)
head(student_mat_bonus, n = 10)