If you want hire me as a freelancer please use the following links.
data <- read.csv("student-por.csv")
data <- data[complete.cases(data),]
data$Talc <- data$Dalc + data$Walc
data$Tedu <- data$Fedu + data$Medu
cols <- c("school","sex","age","address","famsize","Pstatus",
"Medu","Fedu","Mjob","Fjob","reason","nursery","internet")
num<-dplyr::select(data, age, Medu, Fedu, traveltime, studytime, failures, famrel, freetime, goout,Dalc, Walc,health,absences, G1,G2,G3)
data$AvgG <- (data$G1 + data$G2 + data$G3)/3
#Histograms of variables
multi.hist(num, freq=F, dcol = "red",
dlty=c("dotted", "solid")) #Average grade of male and female
ggplot(data, aes(x = AvgG ,fill=sex))+
geom_histogram()+
labs(title = "Average Grade")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Correlation of variabes
#res<-Hmisc::rcorr(as.matrix(num))
#corrplot(num, type="upper", order="hclust")
#Average grade of students
ggplot(data, aes(x=sex, y=AvgG, fill=sex)) +
geom_boxplot() +
theme_classic() +
labs(title="Average grade performance of Males vs Female", x="Gender", y = "Average Grade")# 1. Does the cumulative consumption of alcohol have normal distribution by gender?
p1 <- ggplot(data, aes(sample=Talc, color=school))+stat_qq()+stat_qq_line()+ theme(legend.position='none')+labs(x='theoretical quantile', y='quantile alcohol consumption')
p2 <- ggplot(data, aes(Talc, fill=factor(sex), color=factor(sex))) + geom_density() + facet_grid(factor(sex) ~ .) + theme(legend.position='none') + labs(x='density', y=NULL)
grid.arrange(p1, p2, ncol=2)# 1. Does the Average grade have normal distribution by gender?
p1 <- ggplot(data, aes(sample=AvgG, color=school))+stat_qq()+stat_qq_line()+ theme(legend.position='none')+labs(x='theoretical quantile', y='quantile Average Grade')
p2 <- ggplot(data, aes(AvgG, fill=factor(sex), color=factor(sex))) + geom_density() + facet_grid(factor(sex) ~ .) + theme(legend.position='none') + labs(x='density', y=NULL)
grid.arrange(p1, p2, ncol=2)#=============================================
# 2.1 What is the total count of workday Very High alcohol consumers (5) and Very low alcohol consumers (1) in each Gender?
data %>% filter(Dalc %in% range(Dalc)) %>% group_by(sex, Dalc) %>% summarise(counts = length(Dalc))## `summarise()` regrouping output by 'sex' (override with `.groups` argument)
## # A tibble: 4 x 3
## # Groups: sex [2]
## sex Dalc counts
## <chr> <int> <int>
## 1 F 1 305
## 2 F 5 2
## 3 M 1 146
## 4 M 5 15
# 3. Does consumption of alcohol depend on number of past class failures?
ggplot(data, aes(x=failures, y=Talc, color=factor(failures))) + geom_boxplot() + geom_jitter(width=.2)+ theme(legend.position='none') + labs(x='number of class faiures', y='total alcohol consumption') + facet_wrap(factor(data$school))# 3. Does consumption of alcohol depend on number of past class failures?
ggplot(data, aes(x=failures, y=Talc, color=factor(failures))) +
geom_boxplot() +
geom_jitter(width=.2)+
theme(legend.position='none') +
labs(x='number of class faiures', y='total alcohol consumption') +
facet_wrap(factor(data$school))# 3. Does consumption of alcohol depend on gender failures?
ggplot(data, aes(x=failures, y=Talc, color=factor(failures))) +
geom_boxplot() +
geom_jitter(width=.2)+
theme(legend.position='none') +
labs(x='number of class faiures', y='total alcohol consumption') +
facet_wrap(factor(data$sex))# 4. Does consumption of alcohol depend on number of parent's cohabitation status?
ggplot(data, aes(x=Pstatus, y=Talc, color=factor(Pstatus))) +
geom_boxplot() +
geom_jitter(width=.2)+
theme(legend.position='none') +
labs(x='parent\'s cohabitation status', y='total alcohol consumption') +
facet_wrap(factor(data$school)) +
scale_x_discrete(labels = c('apart', 'together'))# 4. Does consumption of alcohol depend on family relation by schools?
ggplot(data, aes(x=famrel, y=Talc, color=factor(famrel))) +
geom_boxplot() +
geom_jitter(width=.2)+
theme(legend.position='none') +
labs(x='parent\'s cohabitation status', y='total alcohol consumption') +
facet_wrap(factor(data$school)) +
scale_x_discrete(labels = c('apart', 'together'))#====================================================================
# 5. Does consumption of alcohol depend on number of romance by gender?
ggplot(data, aes(x=romantic, y=Talc, color=factor(romantic))) +
geom_boxplot() +
geom_jitter(width=.2)+
theme(legend.position='none') +
labs(x='Romantice', y='total alcohol consumption') +
facet_wrap(factor(data$sex)) +
scale_x_discrete(labels = c('Yes', 'No'))# 5. Which type of educational support is most significant to total consumption of alcohol?
p1 <- filter(data, schoolsup=='yes', famsup=='no') %>% ggplot(.,aes(x=0, y=Talc)) + geom_boxplot()+ geom_jitter(width=.2) + scale_x_discrete(labels=NULL) + coord_cartesian(ylim = c(0, 10)) + labs(title='extra edu support', x=NULL, y='total alcohol consumption')
p2 <- filter(data, schoolsup=='no', famsup=='yes') %>% ggplot(.,aes(x=0, y=Talc)) + geom_boxplot()+ geom_jitter(width=.2) + scale_x_discrete(labels=NULL) + coord_cartesian(ylim = c(0, 10)) + labs(title='family edu support', x=NULL, y=NULL)
p3 <- filter(data, schoolsup=='yes', famsup=='yes') %>% ggplot(.,aes(x=0, y=Talc)) + geom_boxplot()+ geom_jitter(width=.2) + scale_x_discrete(labels=NULL) + coord_cartesian(ylim = c(0, 10)) + labs(title='w/ edu support', x=NULL, y=NULL)
p4 <- filter(data, schoolsup=='yes', famsup=='yes') %>% ggplot(.,aes(x=0, y=Talc)) + geom_boxplot()+ geom_jitter(width=.2) + scale_x_discrete(labels=NULL) + coord_cartesian(ylim = c(0, 10)) + labs(title='w/o edu support', x=NULL, y=NULL)
grid.arrange(p1, p2, p3, p4, ncol=4)# 5. Which type of educational support is most significant to total consumption of alcohol?
p1 <- filter(data, schoolsup=='yes', famsup=='no') %>% ggplot(.,aes(x=0, y=AvgG)) + geom_boxplot()+ geom_jitter(width=.2) + scale_x_discrete(labels=NULL) + coord_cartesian(ylim = c(0, 10)) + labs(title='extra edu support', x=NULL, y='Grade')
p2 <- filter(data, schoolsup=='no', famsup=='yes') %>% ggplot(.,aes(x=0, y=AvgG)) + geom_boxplot()+ geom_jitter(width=.2) + scale_x_discrete(labels=NULL) + coord_cartesian(ylim = c(0, 10)) + labs(title='family edu support', x=NULL, y=NULL)
p3 <- filter(data, schoolsup=='yes', famsup=='yes') %>% ggplot(.,aes(x=0, y=AvgG)) + geom_boxplot()+ geom_jitter(width=.2) + scale_x_discrete(labels=NULL) + coord_cartesian(ylim = c(0, 10)) + labs(title='w/ edu support', x=NULL, y=NULL)
p4 <- filter(data, schoolsup=='yes', famsup=='yes') %>% ggplot(.,aes(x=0, y=AvgG)) + geom_boxplot()+ geom_jitter(width=.2) + scale_x_discrete(labels=NULL) + coord_cartesian(ylim = c(0, 10)) + labs(title='w/o edu support', x=NULL, y=NULL)
grid.arrange(p1, p2, p3, p4, ncol=4)#Does consumption of alcohol depend on extra-curricular activities?
ggplot(data, aes(x=activities, y=Talc, color=factor(activities))) +
geom_boxplot() +
geom_jitter(width=.1) +
facet_wrap(factor(school) ~ .) +
labs(x='extra-curricular activities', y='total alcohol consumption') +
theme(legend.position='none') # Which school class has larger cumulative alcohol consumption?
p1 <- ggplot(data, aes(reorder(school, Talc, sum), Talc, fill=school)) +
geom_col() +
theme(legend.position='none') +
labs(x=NULL, y='total alcohol consumption')
p2 <- ggplot(data, aes(Talc, fill=factor(school), color=factor(school)))+
geom_bar()+
facet_wrap(factor(school) ~ .) +
theme(legend.position='none') +
labs(x=NULL, y=NULL)
grid.arrange(p1, p2, ncol=2, widths=c(1, 4))# Which gender has larger cumulative alcohol consumption?
p1 <- ggplot(data, aes(reorder(sex, Talc, sum), Talc, fill=sex)) +
geom_col() +
theme(legend.position='none') +
labs(x=NULL, y='total alcohol consumption')
p2 <- ggplot(data, aes(Talc, fill=factor(sex), color=factor(sex)))+
geom_bar()+
facet_wrap(factor(sex) ~ .) +
theme(legend.position='none') +
labs(x=NULL, y=NULL)
grid.arrange(p1, p2, ncol=2, widths=c(1, 4))# 9. Does the address or family size of a student influence their consumption of alcohol?
p1 <- ggplot(data, aes(y=Talc, color=address)) + geom_boxplot() + theme(legend.position='bottom') + scale_x_discrete(labels=NULL)
p2 <- ggplot(data, aes(y=Talc, color=famsize)) + geom_boxplot() + theme(legend.position='bottom')+ scale_x_discrete(labels=NULL)
grid.arrange(p1, p2, ncol=2)# 10. What is the correlation of total consumption of alcohol and age of students?
cor(data$Talc, data$age)## [1] 0.1182256
qplot(data$Talc,
data$age,
data = data,
geom = c("point", "smooth"),
method = "lm",
alpha = I(1 / 5),
se = FALSE)## Warning: Ignoring unknown parameters: method, se
## Warning: Use of `data$Talc` is discouraged. Use `Talc` instead.
## Warning: Use of `data$age` is discouraged. Use `age` instead.
## Warning: Use of `data$Talc` is discouraged. Use `Talc` instead.
## Warning: Use of `data$age` is discouraged. Use `age` instead.
## `geom_smooth()` using formula 'y ~ x'
#Does the total consumption of alcohol have a positive correlation with grades? What is the value?
select(data, Talc, G1, G2, G3) %>% cor()## Talc G1 G2 G3
## Talc 1.0000000 -0.1909314 -0.1942222 -0.2088812
## G1 -0.1909314 1.0000000 0.8649816 0.8263871
## G2 -0.1942222 0.8649816 1.0000000 0.9185480
## G3 -0.2088812 0.8263871 0.9185480 1.0000000
M <- cor(data[,c(31,32,33,34)])
corrplot(M, method = "number")#What attribute has the most significant correlation with consumption of alcohol?
cor_mat <- melt(cor(select_if(data, is.numeric)))
ggplot(melt(cor(select_if(data, is.numeric))), aes(Var1, Var2, fill=value)) + labs(x=NULL, y=NULL)+ geom_tile(color='white') + scale_fill_gradient2(low = "white", high = "red", limit = c(-1,1), name='Correlation', position='bottom') + theme_minimal() + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + coord_fixed()# 17. Is there difference in consumption of alcohol Between Female and Male Students
df1 <- data %>% select(sex,Dalc)
p1<-ggplot(data, aes(x=Dalc)) + geom_histogram(fill="dodgerblue", colour="black",binwidth=1) +
facet_grid(sex ~ .)+geom_vline(data=aggregate(df1[2], df1[1], median),
mapping=aes(xintercept=Dalc), color="red")
df2 <- data %>% select(sex,Walc)
p2<-ggplot(data, aes(x=Walc)) + geom_histogram(fill="green4", colour="black",binwidth = 1) +
facet_grid(sex ~ .)+geom_vline(data=aggregate(df2[2], df2[1], median),
mapping=aes(xintercept=Walc), color="red")
grid.arrange(p1,p2)# 18. Does Alcohol Consumption During Schooldays and Weekenddays Affect Students' Academic Performance?
p1 <- data[c(7:8,27:28,31:33)]
d3 <- data.frame(p1$Medu,p1$Fedu, p1$Dalc, p1$Walc, G = c(p1[ ,"G1"],p1[, "G2"], p1[, "G3"]))
sd = ggplot(d3, aes(x=G))+geom_histogram(fill = "slateblue", colour="black", binwidth =1)+
facet_grid(p1.Dalc ~ .)+geom_vline(data = aggregate(d3[5], d3[3], median),
mapping = aes(xintercept = G), color = "red" )+labs(x = "Weekend Days - Grades", y = "Numbers of People")
wd = ggplot(d3, aes(x=G))+geom_histogram(fill = "slateblue", colour="black", binwidth =1)+
facet_grid(p1.Walc ~ .)+geom_vline(data = aggregate(d3[5], d3[3], median),
mapping = aes(xintercept = G), color = "red" )+labs(x = "School Days - Grades", y = "Numbers of People")
grid.arrange(sd,wd)# Regression Analysis
lmodel <- lm(Talc ~ ., data = data)
summary(lmodel)## Warning in summary.lm(lmodel): essentially perfect fit: summary may be
## unreliable
##
## Call:
## lm(formula = Talc ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.477e-14 -7.410e-16 2.600e-17 9.520e-16 1.909e-14
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.881e-15 2.687e-15 -2.933e+00 0.003482 **
## schoolMS -1.249e-15 3.570e-16 -3.498e+00 0.000502 ***
## sexM -3.016e-15 3.286e-16 -9.178e+00 < 2e-16 ***
## age 2.959e-16 1.348e-16 2.195e+00 0.028528 *
## addressU -6.227e-17 3.424e-16 -1.820e-01 0.855755
## famsizeLE3 -2.238e-16 3.207e-16 -6.980e-01 0.485492
## PstatusT 2.063e-16 4.532e-16 4.550e-01 0.649148
## Medu -1.585e-16 1.981e-16 -8.000e-01 0.424027
## Fedu -3.308e-16 1.802e-16 -1.836e+00 0.066788 .
## Mjobhealth 1.537e-16 7.037e-16 2.180e-01 0.827171
## Mjobother 3.050e-16 3.961e-16 7.700e-01 0.441636
## Mjobservices 2.608e-16 4.884e-16 5.340e-01 0.593600
## Mjobteacher 6.745e-17 6.571e-16 1.030e-01 0.918275
## Fjobhealth 1.977e-16 9.839e-16 2.010e-01 0.840821
## Fjobother -4.100e-16 5.974e-16 -6.860e-01 0.492830
## Fjobservices -1.253e-15 6.287e-16 -1.994e+00 0.046654 *
## Fjobteacher -1.263e-15 8.832e-16 -1.430e+00 0.153226
## reasonhome -2.239e-17 3.726e-16 -6.000e-02 0.952111
## reasonother 4.044e-16 4.821e-16 8.390e-01 0.401916
## reasonreputation -7.010e-17 3.904e-16 -1.800e-01 0.857571
## guardianmother -2.306e-16 3.473e-16 -6.640e-01 0.507068
## guardianother -4.634e-16 6.951e-16 -6.670e-01 0.505254
## traveltime 5.014e-17 2.085e-16 2.400e-01 0.810027
## studytime 2.534e-16 1.846e-16 1.372e+00 0.170429
## failures 4.610e-16 2.774e-16 1.662e+00 0.097087 .
## schoolsupyes -6.033e-16 4.831e-16 -1.249e+00 0.212237
## famsupyes 3.353e-16 2.984e-16 1.123e+00 0.261681
## paidyes -4.156e-17 6.042e-16 -6.900e-02 0.945179
## activitiesyes -7.932e-16 2.922e-16 -2.715e+00 0.006817 **
## nurseryyes 2.660e-17 3.547e-16 7.500e-02 0.940253
## higheryes 2.062e-16 5.095e-16 4.050e-01 0.685908
## internetyes 7.375e-16 3.612e-16 2.042e+00 0.041614 *
## romanticyes -1.326e-16 3.007e-16 -4.410e-01 0.659319
## famrel 1.787e-16 1.525e-16 1.172e+00 0.241738
## freetime 3.545e-16 1.469e-16 2.413e+00 0.016131 *
## goout -1.408e-15 1.405e-16 -1.002e+01 < 2e-16 ***
## Dalc 1.000e+00 2.003e-16 4.992e+15 < 2e-16 ***
## Walc 1.000e+00 1.548e-16 6.461e+15 < 2e-16 ***
## health -1.093e-16 1.015e-16 -1.077e+00 0.281871
## absences 3.928e-17 3.274e-17 1.200e+00 0.230715
## G1 4.733e-16 1.059e-16 4.471e+00 9.3e-06 ***
## G2 -2.578e-16 1.385e-16 -1.861e+00 0.063158 .
## G3 -7.700e-17 1.131e-16 -6.810e-01 0.496373
## Tedu NA NA NA NA
## AvgG NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.481e-15 on 606 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 5.055e+30 on 42 and 606 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(lmodel)