# Read the student data set and add avggrade: the per-student mean of the
# G1, G2 and G3 columns.
student.mat <- read.csv(
  file = "~/Mscs 150 I20/Project/Cha-Faith_Project_Folder/student-mat.csv"
)
student.mat[["avggrade"]] <- with(student.mat, (G1 + G2 + G3) / 3)
# Mean of avggrade (and group size N) for every sex/age combination.
gradesavg <- student.mat %>%
  group_by(sex, age) %>%
  summarize(N = n(), mean = mean(avggrade)) %>%
  ungroup() %>%
  # Drop the sparse oldest age groups by value instead of by row position.
  # The previous gradesavg[-c(13, 14), ] silently removes different groups as
  # soon as the data or the sort order changes.
  # NOTE(review): assumes rows 13-14 were the age 21 and 22 groups - confirm.
  filter(age <= 20)

# Per-sex summary of the table above. N is the number of age groups per sex
# and avg is the unweighted mean of the age-group means (not a per-student
# mean).
grades_sex <- gradesavg %>%
  group_by(sex) %>%
  summarize(N = n(), avg = mean(mean)) %>%
  ungroup()
# Dodged bar chart: mean grade at each age, one bar per sex.
ggplot(
  data = gradesavg,
  mapping = aes(
    x = factor(age),
    y = mean,
    fill = factor(sex),
    group = factor(sex)
  )
) +
  geom_col(position = "dodge", color = "black") +
  labs(
    x = "Age",
    y = "Average Grade",
    title = "Average Grade by Age and Sex"
  )

# Bar chart of the per-sex averages held in grades_sex.
ggplot(data = grades_sex, mapping = aes(x = sex, y = avg, fill = sex)) +
  geom_col() +
  labs(
    x = "Sex",
    y = "Average Grade",
    title = "Average Grade by Sex"
  )
#No noticeable difference in average grades between the two sexes
# Density of avggrade, filled by number of failures, one panel per age.
# geom_density() draws a kernel density estimate, so the y axis is a density
# rather than a count; the axis label now says so.
p <- ggplot(student.mat, aes(x = avggrade)) +
  geom_density(aes(fill = factor(failures)), alpha = 0.8) +
  facet_wrap(~age) +
  labs(
    title = "Average Grades: Amount of Failed Classes by Age",
    x = "Average Grades",
    y = "Density",
    caption = "Source: student.mat"
  )
p
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
#This density plot shows the distribution of students’ average grades, colored by the number of classes they have failed and facet-wrapped by age. For ages 21 and 22 there wasn’t enough data to draw a density, so those groups were dropped. The most striking panel is age 17, where students who had failed at least 2 classes had an average grade of about 5 and the density peaked at roughly 0.6, close to the maximum on the plot. What is interesting is that the curve for 2 failed classes, and in the age 16 panel the curve for 3 failed classes (which only reaches a density of about 0.2), each show two prominent humps but not much variation in average grade. The age 20 panel only has a curve for students who failed 2 classes, and it does not go above a density of 0.2. This visual suggests that there is not really a connection between the number of classes a student has failed and their average grades, although a few outliers change the shape of the graph in some panels.
# Linear regression of avggrade on the alcohol, study-time, going-out,
# absence and free-time columns; the coefficient summary is printed.
alcmodel <- lm(
  formula = avggrade ~ Dalc + Walc + studytime + goout + absences + freetime,
  data = student.mat
)
summary(alcmodel)
##
## Call:
## lm(formula = avggrade ~ Dalc + Walc + studytime + goout + absences +
## freetime, data = student.mat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.2177 -2.2942 -0.0677 2.4414 8.6864
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.310958 0.949040 10.865 < 2e-16 ***
## Dalc -0.150536 0.273953 -0.549 0.58298
## Walc 0.081020 0.203005 0.399 0.69004
## studytime 0.597047 0.228279 2.615 0.00926 **
## goout -0.565659 0.188238 -3.005 0.00283 **
## absences 0.006819 0.023248 0.293 0.76943
## freetime 0.281276 0.196304 1.433 0.15270
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.641 on 388 degrees of freedom
## Multiple R-squared: 0.0449, Adjusted R-squared: 0.03013
## F-statistic: 3.04 on 6 and 388 DF, p-value: 0.006441
# Tidy coefficient table of alcmodel, passed through round_df().
alcstats <- round_df(tidy(alcmodel))

# Coefficients with confidence intervals; the intercept row is removed so
# only the predictors remain.
alcconf <- tidy(alcmodel, conf.int = TRUE) %>%
  subset(term %nin% "(Intercept)") %>%
  round_df()

# ANOVA table, fitted values and residuals of the same model.
alcanova <- anova(alcmodel)
alcfitted <- fitted(alcmodel)
alcresid <- resid(alcmodel)

# aov() fit built from the lm object, plus its tidy table and printed summary.
alcaov <- aov(alcmodel, data = student.mat)
alcaovtidy <- round_df(tidy(alcaov, conf.int = TRUE))
summary(alcaov)
## Df Sum Sq Mean Sq F value Pr(>F)
## Dalc 1 28 28.31 2.136 0.14471
## Walc 1 16 15.64 1.180 0.27800
## studytime 1 72 71.53 5.396 0.02069 *
## goout 1 99 98.73 7.449 0.00664 **
## absences 1 0 0.37 0.028 0.86760
## freetime 1 27 27.21 2.053 0.15270
## Residuals 388 5143 13.25
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#values with asterisks suggest statistical significance
# One point per ANOVA term showing its p-value, with terms on the vertical
# axis after the flip.
ggplot(data = alcaovtidy, mapping = aes(x = term, y = p.value)) +
  geom_point() +
  coord_flip()
## Warning: Removed 1 rows containing missing values (geom_point).
# Coefficient estimates with their confidence intervals, ordered by estimate;
# the translucent line marks zero.
ggplot(
  data = alcconf,
  mapping = aes(
    x = reorder(term, estimate),
    y = estimate,
    ymin = conf.low,
    ymax = conf.high
  )
) +
  geom_pointrange() +
  coord_flip() +
  geom_hline(yintercept = 0, size = 1.4, color = "black", alpha = .5)
# Mean avggrade (and cell size N) for every observed combination of sex, age,
# goout, studytime, Dalc and Walc.
# The previous version also grouped by avggrade and summarised twice. That
# collapsed students sharing a grade into one row, so the second summarise
# averaged the distinct grades in each cell instead of all students in it.
gooutmod <- student.mat %>%
  group_by(sex, age, goout, studytime, Dalc, Walc) %>%
  summarize(N = n(), averagegrade = mean(avggrade)) %>%
  ungroup()

# Replace the numeric codes with ordered, human-readable labels for plotting.
gooutmod <- mutate(gooutmod,
  studytime = factor(
    studytime,
    labels = c("<2 hours", "2 to 5 hours", "5 to 10 hours", ">10 hours"),
    ordered = TRUE
  ),
  Dalc = factor(
    Dalc,
    labels = c("very low", "low", "moderate", "high", "very high"),
    ordered = TRUE
  ),
  Walc = factor(
    Walc,
    labels = c("very low", "low", "moderate", "high", "very high"),
    ordered = TRUE
  )
)
# Palette with one colour per study-time level.
alccolors <- c("#44cf4d", "#cfcd00", "#cf4449", "#444dcf")

# Jittered scatter of average grade against going-out level, coloured by study
# time, with one linear trend line per study-time level.
# No size aesthetic is mapped, so the subtitle and size legend title that
# described point size as alcohol consumption were removed.
ggplot(
  gooutmod,
  aes(x = goout, y = averagegrade, color = factor(studytime), group = studytime)
) +
  geom_jitter(alpha = .5) +
  labs(
    x = "Going Out\nScale from 1(Low) to 5(High)",
    y = "Average Grade",
    title = "Average Grade based on amount of Going Out",
    color = "Study Time"
  ) +
  geom_smooth(method = "lm", se = FALSE, size = 1.5) +
  scale_color_manual(values = alccolors)
# Boxplots of average grade at each going-out level, split by study time.
# The mapped aesthetic is fill, so the legend title has to be set through
# fill; setting it through color left the legend titled "factor(studytime)".
# The size label was dropped because no size aesthetic is mapped.
ggplot(
  gooutmod,
  aes(x = factor(goout), y = averagegrade, fill = factor(studytime))
) +
  geom_boxplot() +
  labs(
    x = "Going Out\nScale from 1(Low) to 5(High)",
    y = "Average Grade",
    title = "Average Grade based on amount of Going Out",
    fill = "Study Time"
  ) +
  scale_fill_manual(values = alccolors)

# One-row model summary of the aov fit.
glance(alcaov)
## # A tibble: 1 x 11
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.0449 0.0301 3.64 3.04 0.00644 7 -1067. 2151. 2183.
## # … with 2 more variables: deviance <dbl>, df.residual <int>
# Palette with one colour per alcohol-consumption level.
alcsch <- c("#0ed9f0", "#efbaf7", "#79fca9", "#f7c886", "#f2aba7")

# Recode both alcohol columns of student.mat as ordered, labelled factors.
# Walc is now built from Walc itself; it used to be built from Dalc, which
# overwrote it with a copy of Dalc.
student.mat <- mutate(student.mat,
  Dalc = factor(
    Dalc,
    labels = c("very low", "low", "medium", "high", "very high"),
    ordered = TRUE
  ),
  Walc = factor(
    Walc,
    labels = c("very low", "low", "medium", "high", "very high"),
    ordered = TRUE
  )
)

# Boxplots of average grade by family support, filled by Dalc. The figure was
# previously filled by the Walc column while it still held a copy of Dalc, so
# mapping Dalc directly renders the same data and matches the legend title.
p <- ggplot(student.mat, aes(x = famsup, y = avggrade, fill = Dalc)) +
  geom_boxplot() +
  labs(
    title = "Average Grades: Family Support by Alcohol Consumption",
    x = "Family Support",
    y = "Average Grades",
    fill = "Alcohol Consumption During the Week",
    caption = "Source: student.mat"
  ) +
  scale_fill_manual(values = alcsch)
p
#I wanted the scales for weekend and workday alcohol consumption to be more coherent and to make more sense on the graph than plain numbers, so I mutated those 2 variables into a word scale from “very low” to “very high”. Based on the boxplot I made with just workday alcohol consumption, we can see that students who have family support and a medium alcohol consumption during the week show some variation in their grades, but they only reach about average grades. The picture is quite different for students who did not have any family support but still had a medium alcohol consumption: there is some variation, but not as much. This visual tells us that whether or not a student has the support of their family, their intake of alcohol during the week does not make much difference to their average grades, with a few exceptions.
# Analysis of variance of avggrade on the family-related columns.
famanova <- aov(
  formula = avggrade ~ Pstatus + Medu + Fedu + Mjob + Fjob + famsup + famrel +
    higher,
  data = student.mat
)
famfitted <- fitted(famanova)
famresid <- resid(famanova)

# Tidy ANOVA table passed through round_df(), opened in the data viewer.
famtidy <- round_df(tidy(famanova, conf.int = TRUE))
view(famtidy)

# One point per ANOVA term showing its p-value.
ggplot(data = famtidy, mapping = aes(x = p.value, y = term)) +
  geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
# Linear model with the same family-related predictors as the ANOVA.
famlm <- lm(
  formula = avggrade ~ Pstatus + Medu + Fedu + Mjob + Fjob + famsup + famrel +
    higher,
  data = student.mat
)

# Tidy coefficient table passed through round_df().
famstats <- round_df(tidy(famlm))

# Coefficients with confidence intervals, without the intercept row.
famconf <- tidy(famlm, conf.int = TRUE) %>%
  subset(term %nin% "(Intercept)") %>%
  round_df()
view(famconf)

# Estimates and intervals with terms in their default order.
ggplot(
  data = famconf,
  mapping = aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high)
) +
  geom_pointrange() +
  coord_flip() +
  geom_hline(yintercept = 0, size = 1.3, color = "black", alpha = .5)

# The same plot with terms sorted by their estimate.
ggplot(
  data = famconf,
  mapping = aes(
    x = reorder(term, estimate),
    y = estimate,
    ymin = conf.low,
    ymax = conf.high
  )
) +
  geom_pointrange() +
  coord_flip() +
  geom_hline(yintercept = 0, size = .8, color = "black", alpha = .5)
# Plotting copy of student.mat with the coded family columns turned into
# labelled factors.
# A grouped summary used to be assigned to fammod just before this statement,
# but it was overwritten here without ever being read, so that dead
# computation has been removed.
#
# Mjob and Fjob are given explicit levels. factor() otherwise sorts the codes
# alphabetically (at_home, health, other, services, teacher) before attaching
# the labels, which mislabelled every job except health.
# NOTE(review): the job factors stay ordered as before, although the
# categories have no natural order - confirm whether that is intended.
job_codes <- c("teacher", "health", "services", "at_home", "other")
job_labels <- c(
  "teacher", "health care related", "civil services", "at home", "other"
)
edu_labels <- c(
  "none", "primary education", "5th to 9th grade", "secondary education",
  "higher education"
)
fammod <- mutate(student.mat,
  Fedu = factor(Fedu, labels = edu_labels, ordered = TRUE),
  Medu = factor(Medu, labels = edu_labels, ordered = TRUE),
  Mjob = factor(Mjob, levels = job_codes, labels = job_labels, ordered = TRUE),
  Fjob = factor(Fjob, levels = job_codes, labels = job_labels, ordered = TRUE),
  studytime = factor(
    studytime,
    labels = c("<2 hours", "2 to 5 hours", "5 to 10 hours", ">10 hours"),
    ordered = TRUE
  )
)
# Average grade by mother's education, one box per family-support value.
ggplot(data = fammod, mapping = aes(x = Medu, y = avggrade, fill = famsup)) +
  geom_boxplot(alpha = .5, position = "dodge", color = "black") +
  labs(
    y = "Average Grade",
    x = "Mother's Education",
    fill = "Family Support",
    title = "Average Grade based on Mother's Education",
    subtitle = "Separated by Family Support"
  ) +
  theme(axis.text.x = element_text(angle = -10))

# Same layout, split by the higher column instead of family support.
ggplot(data = fammod, mapping = aes(x = Medu, y = avggrade, fill = higher)) +
  geom_boxplot(alpha = .5, position = "dodge", color = "black") +
  labs(
    y = "Average Grade",
    x = "Mother's Education",
    fill = "Want to\nPursue Higher\nEdu",
    title = "Average Grade based on Mother's Education",
    subtitle = "Separated by Desire to Pursue Higher Education"
  ) +
  theme(axis.text.x = element_text(angle = -10))

# One-row model summary of the family ANOVA.
glance(famanova)
## # A tibble: 1 x 11
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.114 0.0810 3.54 3.48 2.26e-5 15 -1053. 2137. 2201.
## # … with 2 more variables: deviance <dbl>, df.residual <int>
# Boxplots of average grade for each school-choice reason, filled by the
# schoolsup column.
p <- ggplot(
  data = student.mat,
  mapping = aes(x = reason, y = avggrade, fill = schoolsup)
) +
  geom_boxplot() +
  labs(
    title = "Average Grades: Reason for School Choice by School Support",
    x = "Reason for School",
    y = "Average Grades",
    subtitle = "Fill color is based on school support",
    caption = "Source: student.mat"
  )
p
#This visual looks at students’ reasons for picking their school, their average grades, and whether or not they had the support of their school. I used a boxplot to compare students with and without school support. For the reasons “reputation” and “other”, students without school support showed a distinct variation: their average grades were above 10 and these were the two biggest boxplots. In contrast, for every reason students gave for picking their school, those who did have school support had the lowest average grades. I find it very ironic, because we would expect students who have the support of their school to do better grade-wise, but that is not the case in this graph. I felt this kind of graph was necessary to actually see how these variables relate to one another, because we were expecting the opposite result.