Student Performance Dataset

Service as a Freelancer

If you want hire me as a freelancer please use the following links.

For Fiverr https://www.fiverr.com/dataanalyst_85
For Upwork https://www.upwork.com/freelancers/~017667e5ef4347dcb4

Analysis

data <- read.csv("student-por.csv")
data <- data[complete.cases(data),]

data$Talc <- data$Dalc + data$Walc
data$Tedu <- data$Fedu + data$Medu


cols <- c("school","sex","age","address","famsize","Pstatus",
          "Medu","Fedu","Mjob","Fjob","reason","nursery","internet")


num<-dplyr::select(data, age, Medu, Fedu, traveltime, studytime, failures, famrel, freetime, goout,Dalc, Walc,health,absences, G1,G2,G3)
data$AvgG <- (data$G1 + data$G2 + data$G3)/3


#Histograms of variables 
multi.hist(num, freq=F, dcol = "red", 
           dlty=c("dotted", "solid"))

#Average grade of male and female
ggplot(data, aes(x = AvgG ,fill=sex))+
   geom_histogram()+
   labs(title = "Average Grade")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Correlation of variabes
#res<-Hmisc::rcorr(as.matrix(num))
#corrplot(num, type="upper", order="hclust")

#Average grade of students
ggplot(data, aes(x=sex, y=AvgG, fill=sex)) + 
   geom_boxplot() + 
   theme_classic() + 
   labs(title="Average grade performance of Males vs Female", x="Gender", y = "Average Grade")

# 1. Does the cumulative consumption of alcohol have normal distribution by gender?
p1 <- ggplot(data, aes(sample=Talc, color=school))+stat_qq()+stat_qq_line()+ theme(legend.position='none')+labs(x='theoretical quantile', y='quantile alcohol consumption')
p2 <- ggplot(data, aes(Talc, fill=factor(sex), color=factor(sex))) + geom_density() + facet_grid(factor(sex) ~ .) + theme(legend.position='none') + labs(x='density', y=NULL)
grid.arrange(p1, p2, ncol=2)

# 1. Does the Average grade have normal distribution by gender?
p1 <- ggplot(data, aes(sample=AvgG, color=school))+stat_qq()+stat_qq_line()+ theme(legend.position='none')+labs(x='theoretical quantile', y='quantile Average Grade')
p2 <- ggplot(data, aes(AvgG, fill=factor(sex), color=factor(sex))) + geom_density() + facet_grid(factor(sex) ~ .) + theme(legend.position='none') + labs(x='density', y=NULL)
grid.arrange(p1, p2, ncol=2)

#=============================================
# 2.1 What is the total count of workday Very High alcohol consumers (5) and Very low alcohol consumers (1) in each Gender?
data %>% filter(Dalc %in% range(Dalc)) %>% group_by(sex, Dalc) %>% summarise(counts = length(Dalc))

## `summarise()` regrouping output by 'sex' (override with `.groups` argument)

## # A tibble: 4 x 3
## # Groups:   sex [2]
##   sex    Dalc counts
##   <chr> <int>  <int>
## 1 F         1    305
## 2 F         5      2
## 3 M         1    146
## 4 M         5     15

# 3.  Does consumption of alcohol depend on number of past class failures?
ggplot(data, aes(x=failures, y=Talc, color=factor(failures))) + geom_boxplot() + geom_jitter(width=.2)+ theme(legend.position='none') + labs(x='number of class faiures', y='total alcohol consumption') + facet_wrap(factor(data$school))

# 3.  Does consumption of alcohol depend on number of past class failures?
ggplot(data, aes(x=failures, y=Talc, color=factor(failures))) + 
   geom_boxplot() + 
   geom_jitter(width=.2)+ 
   theme(legend.position='none') + 
   labs(x='number of class faiures', y='total alcohol consumption') + 
   facet_wrap(factor(data$school))

# 3.  Does consumption of alcohol depend on gender failures?
ggplot(data, aes(x=failures, y=Talc, color=factor(failures))) + 
   geom_boxplot() + 
   geom_jitter(width=.2)+ 
   theme(legend.position='none') + 
   labs(x='number of class faiures', y='total alcohol consumption') + 
   facet_wrap(factor(data$sex))

# 4.  Does consumption of alcohol depend on number of parent's cohabitation status?
ggplot(data, aes(x=Pstatus, y=Talc, color=factor(Pstatus))) + 
   geom_boxplot() + 
   geom_jitter(width=.2)+ 
   theme(legend.position='none') + 
   labs(x='parent\'s cohabitation status', y='total alcohol consumption') + 
   facet_wrap(factor(data$school)) + 
   scale_x_discrete(labels = c('apart', 'together'))

# 4.  Does consumption of alcohol depend on family relation by schools?
ggplot(data, aes(x=famrel, y=Talc, color=factor(famrel))) + 
   geom_boxplot() + 
   geom_jitter(width=.2)+ 
   theme(legend.position='none') + 
   labs(x='parent\'s cohabitation status', y='total alcohol consumption') + 
   facet_wrap(factor(data$school)) + 
   scale_x_discrete(labels = c('apart', 'together'))

#====================================================================

# 5.  Does consumption of alcohol depend on number of romance by gender?
ggplot(data, aes(x=romantic, y=Talc, color=factor(romantic))) + 
   geom_boxplot() + 
   geom_jitter(width=.2)+ 
   theme(legend.position='none') + 
   labs(x='Romantice', y='total alcohol consumption') + 
   facet_wrap(factor(data$sex)) + 
   scale_x_discrete(labels = c('Yes', 'No'))

# 5.  Which type of educational support is most significant to total consumption of alcohol?
p1 <- filter(data, schoolsup=='yes', famsup=='no') %>% ggplot(.,aes(x=0, y=Talc)) + geom_boxplot()+ geom_jitter(width=.2) + scale_x_discrete(labels=NULL) + coord_cartesian(ylim = c(0, 10)) + labs(title='extra edu support', x=NULL, y='total alcohol consumption') 
p2 <- filter(data, schoolsup=='no', famsup=='yes') %>% ggplot(.,aes(x=0, y=Talc)) + geom_boxplot()+ geom_jitter(width=.2) + scale_x_discrete(labels=NULL) + coord_cartesian(ylim = c(0, 10)) + labs(title='family edu support', x=NULL, y=NULL) 
p3 <- filter(data, schoolsup=='yes', famsup=='yes') %>% ggplot(.,aes(x=0, y=Talc)) + geom_boxplot()+ geom_jitter(width=.2) + scale_x_discrete(labels=NULL) + coord_cartesian(ylim = c(0, 10)) + labs(title='w/ edu support', x=NULL, y=NULL) 
p4 <- filter(data, schoolsup=='yes', famsup=='yes') %>% ggplot(.,aes(x=0, y=Talc)) + geom_boxplot()+ geom_jitter(width=.2) + scale_x_discrete(labels=NULL) + coord_cartesian(ylim = c(0, 10)) + labs(title='w/o edu support', x=NULL, y=NULL)
grid.arrange(p1, p2, p3, p4, ncol=4)

# 5.  Which type of educational support is most significant to total consumption of alcohol?
p1 <- filter(data, schoolsup=='yes', famsup=='no') %>% ggplot(.,aes(x=0, y=AvgG)) + geom_boxplot()+ geom_jitter(width=.2) + scale_x_discrete(labels=NULL) + coord_cartesian(ylim = c(0, 10)) + labs(title='extra edu support', x=NULL, y='Grade') 
p2 <- filter(data, schoolsup=='no', famsup=='yes') %>% ggplot(.,aes(x=0, y=AvgG)) + geom_boxplot()+ geom_jitter(width=.2) + scale_x_discrete(labels=NULL) + coord_cartesian(ylim = c(0, 10)) + labs(title='family edu support', x=NULL, y=NULL) 
p3 <- filter(data, schoolsup=='yes', famsup=='yes') %>% ggplot(.,aes(x=0, y=AvgG)) + geom_boxplot()+ geom_jitter(width=.2) + scale_x_discrete(labels=NULL) + coord_cartesian(ylim = c(0, 10)) + labs(title='w/ edu support', x=NULL, y=NULL) 
p4 <- filter(data, schoolsup=='yes', famsup=='yes') %>% ggplot(.,aes(x=0, y=AvgG)) + geom_boxplot()+ geom_jitter(width=.2) + scale_x_discrete(labels=NULL) + coord_cartesian(ylim = c(0, 10)) + labs(title='w/o edu support', x=NULL, y=NULL)
grid.arrange(p1, p2, p3, p4, ncol=4)

#Does consumption of alcohol depend on extra-curricular activities?
ggplot(data, aes(x=activities, y=Talc, color=factor(activities))) + 
   geom_boxplot() + 
   geom_jitter(width=.1) + 
   facet_wrap(factor(school) ~ .) +
   labs(x='extra-curricular activities', y='total alcohol consumption') + 
   theme(legend.position='none')

# Which school class has larger cumulative alcohol consumption?
p1 <- ggplot(data, aes(reorder(school, Talc, sum), Talc, fill=school)) + 
   geom_col() + 
   theme(legend.position='none') + 
   labs(x=NULL, y='total alcohol consumption')
p2 <- ggplot(data, aes(Talc, fill=factor(school), color=factor(school)))+
   geom_bar()+ 
   facet_wrap(factor(school) ~ .) + 
   theme(legend.position='none') + 
   labs(x=NULL, y=NULL)
grid.arrange(p1, p2, ncol=2, widths=c(1, 4))

# Which gender has larger cumulative alcohol consumption?
p1 <- ggplot(data, aes(reorder(sex, Talc, sum), Talc, fill=sex)) + 
   geom_col() + 
   theme(legend.position='none') + 
   labs(x=NULL, y='total alcohol consumption')
p2 <- ggplot(data, aes(Talc, fill=factor(sex), color=factor(sex)))+
   geom_bar()+ 
   facet_wrap(factor(sex) ~ .) + 
   theme(legend.position='none') + 
   labs(x=NULL, y=NULL)
grid.arrange(p1, p2, ncol=2, widths=c(1, 4))

# 9.  Does the address or family size of a student influence their consumption of alcohol?
p1 <- ggplot(data, aes(y=Talc, color=address)) + geom_boxplot() + theme(legend.position='bottom') + scale_x_discrete(labels=NULL) 
p2 <- ggplot(data, aes(y=Talc, color=famsize)) + geom_boxplot() + theme(legend.position='bottom')+ scale_x_discrete(labels=NULL) 
grid.arrange(p1, p2, ncol=2)

# 10. What is the correlation of total consumption of alcohol and age of students? 
cor(data$Talc, data$age)

## [1] 0.1182256

qplot(data$Talc, 
      data$age, 
      data = data, 
      geom = c("point", "smooth"), 
      method = "lm", 
      alpha = I(1 / 5), 
      se = FALSE)

## Warning: Ignoring unknown parameters: method, se

## Warning: Use of `data$Talc` is discouraged. Use `Talc` instead.

## Warning: Use of `data$age` is discouraged. Use `age` instead.

## Warning: Use of `data$Talc` is discouraged. Use `Talc` instead.

## Warning: Use of `data$age` is discouraged. Use `age` instead.

## `geom_smooth()` using formula 'y ~ x'

#Does the total consumption of alcohol have a positive correlation with grades? What is the value?
select(data, Talc, G1, G2, G3) %>% cor()

##            Talc         G1         G2         G3
## Talc  1.0000000 -0.1909314 -0.1942222 -0.2088812
## G1   -0.1909314  1.0000000  0.8649816  0.8263871
## G2   -0.1942222  0.8649816  1.0000000  0.9185480
## G3   -0.2088812  0.8263871  0.9185480  1.0000000

M <- cor(data[,c(31,32,33,34)])
corrplot(M, method = "number")

#What attribute has the most significant correlation with consumption of alcohol?
cor_mat <- melt(cor(select_if(data, is.numeric)))
ggplot(melt(cor(select_if(data, is.numeric))), aes(Var1, Var2, fill=value)) + labs(x=NULL, y=NULL)+ geom_tile(color='white') + scale_fill_gradient2(low = "white", high = "red", limit = c(-1,1), name='Correlation', position='bottom') + theme_minimal() + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + coord_fixed()

# 17. Is there difference in consumption of alcohol Between Female and Male Students 
df1 <- data %>% select(sex,Dalc)
p1<-ggplot(data, aes(x=Dalc)) + geom_histogram(fill="dodgerblue", colour="black",binwidth=1) +
   facet_grid(sex ~ .)+geom_vline(data=aggregate(df1[2], df1[1], median), 
                                  mapping=aes(xintercept=Dalc), color="red")
df2 <- data %>% select(sex,Walc)
p2<-ggplot(data, aes(x=Walc)) + geom_histogram(fill="green4", colour="black",binwidth = 1) +
   facet_grid(sex ~ .)+geom_vline(data=aggregate(df2[2], df2[1], median), 
                                  mapping=aes(xintercept=Walc), color="red") 
grid.arrange(p1,p2)

# 18. Does Alcohol Consumption During Schooldays and Weekenddays Affect Students' Academic Performance?
p1 <- data[c(7:8,27:28,31:33)]
d3 <- data.frame(p1$Medu,p1$Fedu, p1$Dalc, p1$Walc, G = c(p1[ ,"G1"],p1[, "G2"], p1[, "G3"]))

sd = ggplot(d3, aes(x=G))+geom_histogram(fill = "slateblue", colour="black", binwidth =1)+
   facet_grid(p1.Dalc ~ .)+geom_vline(data = aggregate(d3[5], d3[3], median),
                                      mapping = aes(xintercept = G), color = "red" )+labs(x = "Weekend Days - Grades", y = "Numbers of People")
wd = ggplot(d3, aes(x=G))+geom_histogram(fill = "slateblue", colour="black", binwidth =1)+
   facet_grid(p1.Walc ~ .)+geom_vline(data = aggregate(d3[5], d3[3], median),
                                      mapping = aes(xintercept = G), color = "red" )+labs(x = "School Days - Grades", y = "Numbers of People")
grid.arrange(sd,wd)

# Regression Analysis
lmodel <- lm(Talc ~ ., data = data)
summary(lmodel)

## Warning in summary.lm(lmodel): essentially perfect fit: summary may be
## unreliable

## 
## Call:
## lm(formula = Talc ~ ., data = data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -5.477e-14 -7.410e-16  2.600e-17  9.520e-16  1.909e-14 
## 
## Coefficients: (2 not defined because of singularities)
##                    Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)      -7.881e-15  2.687e-15 -2.933e+00 0.003482 ** 
## schoolMS         -1.249e-15  3.570e-16 -3.498e+00 0.000502 ***
## sexM             -3.016e-15  3.286e-16 -9.178e+00  < 2e-16 ***
## age               2.959e-16  1.348e-16  2.195e+00 0.028528 *  
## addressU         -6.227e-17  3.424e-16 -1.820e-01 0.855755    
## famsizeLE3       -2.238e-16  3.207e-16 -6.980e-01 0.485492    
## PstatusT          2.063e-16  4.532e-16  4.550e-01 0.649148    
## Medu             -1.585e-16  1.981e-16 -8.000e-01 0.424027    
## Fedu             -3.308e-16  1.802e-16 -1.836e+00 0.066788 .  
## Mjobhealth        1.537e-16  7.037e-16  2.180e-01 0.827171    
## Mjobother         3.050e-16  3.961e-16  7.700e-01 0.441636    
## Mjobservices      2.608e-16  4.884e-16  5.340e-01 0.593600    
## Mjobteacher       6.745e-17  6.571e-16  1.030e-01 0.918275    
## Fjobhealth        1.977e-16  9.839e-16  2.010e-01 0.840821    
## Fjobother        -4.100e-16  5.974e-16 -6.860e-01 0.492830    
## Fjobservices     -1.253e-15  6.287e-16 -1.994e+00 0.046654 *  
## Fjobteacher      -1.263e-15  8.832e-16 -1.430e+00 0.153226    
## reasonhome       -2.239e-17  3.726e-16 -6.000e-02 0.952111    
## reasonother       4.044e-16  4.821e-16  8.390e-01 0.401916    
## reasonreputation -7.010e-17  3.904e-16 -1.800e-01 0.857571    
## guardianmother   -2.306e-16  3.473e-16 -6.640e-01 0.507068    
## guardianother    -4.634e-16  6.951e-16 -6.670e-01 0.505254    
## traveltime        5.014e-17  2.085e-16  2.400e-01 0.810027    
## studytime         2.534e-16  1.846e-16  1.372e+00 0.170429    
## failures          4.610e-16  2.774e-16  1.662e+00 0.097087 .  
## schoolsupyes     -6.033e-16  4.831e-16 -1.249e+00 0.212237    
## famsupyes         3.353e-16  2.984e-16  1.123e+00 0.261681    
## paidyes          -4.156e-17  6.042e-16 -6.900e-02 0.945179    
## activitiesyes    -7.932e-16  2.922e-16 -2.715e+00 0.006817 ** 
## nurseryyes        2.660e-17  3.547e-16  7.500e-02 0.940253    
## higheryes         2.062e-16  5.095e-16  4.050e-01 0.685908    
## internetyes       7.375e-16  3.612e-16  2.042e+00 0.041614 *  
## romanticyes      -1.326e-16  3.007e-16 -4.410e-01 0.659319    
## famrel            1.787e-16  1.525e-16  1.172e+00 0.241738    
## freetime          3.545e-16  1.469e-16  2.413e+00 0.016131 *  
## goout            -1.408e-15  1.405e-16 -1.002e+01  < 2e-16 ***
## Dalc              1.000e+00  2.003e-16  4.992e+15  < 2e-16 ***
## Walc              1.000e+00  1.548e-16  6.461e+15  < 2e-16 ***
## health           -1.093e-16  1.015e-16 -1.077e+00 0.281871    
## absences          3.928e-17  3.274e-17  1.200e+00 0.230715    
## G1                4.733e-16  1.059e-16  4.471e+00  9.3e-06 ***
## G2               -2.578e-16  1.385e-16 -1.861e+00 0.063158 .  
## G3               -7.700e-17  1.131e-16 -6.810e-01 0.496373    
## Tedu                     NA         NA         NA       NA    
## AvgG                     NA         NA         NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.481e-15 on 606 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 5.055e+30 on 42 and 606 DF,  p-value: < 2.2e-16

par(mfrow=c(2,2))
plot(lmodel)

Student Performance Dataset

Abdul Samad

June 26, 2021

Service as a Freelancer

Analysis