## load libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(class)
library(rpart)
library(rpart.plot)
library(e1071)
library(caret)
## Loading required package: lattice
library(caTools)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'party'
## The following object is masked from 'package:dplyr':
##
## where
# Read the xAPI-Edu-Data CSV into `data` and print its structure.
data <- read.csv(file = "xAPI-Edu-Data.csv")
str(object = data)
## 'data.frame': 480 obs. of 17 variables:
## $ gender : chr "M" "M" "M" "M" ...
## $ NationalITy : chr "KW" "KW" "KW" "KW" ...
## $ PlaceofBirth : chr "KuwaIT" "KuwaIT" "KuwaIT" "KuwaIT" ...
## $ StageID : chr "lowerlevel" "lowerlevel" "lowerlevel" "lowerlevel" ...
## $ GradeID : chr "G-04" "G-04" "G-04" "G-04" ...
## $ SectionID : chr "A" "A" "A" "A" ...
## $ Topic : chr "IT" "IT" "IT" "IT" ...
## $ Semester : chr "F" "F" "F" "F" ...
## $ Relation : chr "Father" "Father" "Father" "Father" ...
## $ raisedhands : int 15 20 10 30 40 42 35 50 12 70 ...
## $ VisITedResources : int 16 20 7 25 50 30 12 10 21 80 ...
## $ AnnouncementsView : int 2 3 0 5 12 13 0 15 16 25 ...
## $ Discussion : int 20 25 30 35 50 70 17 22 50 70 ...
## $ ParentAnsweringSurvey : chr "Yes" "Yes" "No" "No" ...
## $ ParentschoolSatisfaction: chr "Good" "Good" "Bad" "Bad" ...
## $ StudentAbsenceDays : chr "Under-7" "Under-7" "Above-7" "Above-7" ...
## $ Class : chr "M" "M" "L" "L" ...
# Per-column summary of the loaded data frame.
summary(object = data)
## gender NationalITy PlaceofBirth StageID
## Length:480 Length:480 Length:480 Length:480
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## GradeID SectionID Topic Semester
## Length:480 Length:480 Length:480 Length:480
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Relation raisedhands VisITedResources AnnouncementsView
## Length:480 Min. : 0.00 Min. : 0.0 Min. : 0.00
## Class :character 1st Qu.: 15.75 1st Qu.:20.0 1st Qu.:14.00
## Mode :character Median : 50.00 Median :65.0 Median :33.00
## Mean : 46.77 Mean :54.8 Mean :37.92
## 3rd Qu.: 75.00 3rd Qu.:84.0 3rd Qu.:58.00
## Max. :100.00 Max. :99.0 Max. :98.00
## Discussion ParentAnsweringSurvey ParentschoolSatisfaction
## Min. : 1.00 Length:480 Length:480
## 1st Qu.:20.00 Class :character Class :character
## Median :39.00 Mode :character Mode :character
## Mean :43.28
## 3rd Qu.:70.00
## Max. :99.00
## StudentAbsenceDays Class
## Length:480 Length:480
## Class :character Class :character
## Mode :character Mode :character
##
##
##
Análisis exploratorio de los datos
Histogramas
# Histograms of the four numeric activity counters.
# An explicit binwidth of 5 with boundary = 0 aligns every bar with the axis
# breaks (0, 5, 10, ...) instead of relying on the default of 30 bins.
ggplot(data = data, aes(x = raisedhands)) +
  geom_histogram(binwidth = 5, boundary = 0, color = "black") +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(x = "Raised Hands", y = "Student Count")

ggplot(data = data, aes(x = VisITedResources)) +
  geom_histogram(binwidth = 5, boundary = 0, color = "black") +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(x = "Visited Resources", y = "Student Count")

ggplot(data = data, aes(x = AnnouncementsView)) +
  geom_histogram(binwidth = 5, boundary = 0, color = "black") +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(x = "Announcements View", y = "Student Count")

ggplot(data = data, aes(x = Discussion)) +
  geom_histogram(binwidth = 5, boundary = 0, color = "black") +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(x = "Discussion Participation", y = "Student Count")

Gráfica de barras
# Horizontal bar chart of student counts per gender.
ggplot(data, aes(x = gender)) +
  geom_bar() +
  coord_flip() +
  scale_y_continuous(breaks = seq(0, 300, 30)) +
  labs(x = "Gender", y = "Student Count")

# Horizontal bar chart of student counts per nationality.
ggplot(data, aes(x = NationalITy)) +
  geom_bar() +
  coord_flip() +
  scale_y_continuous(breaks = seq(0, 200, 20)) +
  labs(x = "Nationality", y = "Student Count")

Jordania y Kuwait (KW) tienen el mayor número de estudiantes.
# Birth place counts, filled by nationality.
# Author's note: usa is a mix of nationalities.
ggplot(data, aes(x = PlaceofBirth)) +
  geom_bar(aes(fill = NationalITy)) +
  coord_flip() +
  labs(x = "Birth Place", y = "Student Count")

# Grade counts, filled by Class.
# Author's note: g-06 has students with only low grades.
ggplot(data, aes(x = GradeID, fill = Class)) +
  geom_bar() +
  coord_flip() +
  labs(x = "Grade ID", y = "Student Count")

# Grade counts, filled by gender.
# Author's note: g-10 has no females.
ggplot(data, aes(x = GradeID, fill = gender)) +
  geom_bar() +
  coord_flip() +
  labs(x = "Grade ID", y = "Student Count")

# Section counts, filled by topic.
ggplot(data, aes(x = SectionID, fill = Topic)) +
  geom_bar() +
  coord_flip() +
  labs(x = "Section ID", y = "Student Count")

La sección C tiene solamente estudiantes de TI y ciencias.
# Topic counts, filled by gender.
ggplot(data, aes(x = Topic, fill = gender)) +
  geom_bar() +
  coord_flip() +
  scale_y_continuous(breaks = seq(0, 100, 4)) +
  labs(x = "Topic", y = "Student Count")

El español tiene la peor proporción hombres:mujeres, mientras que ciencia,
química, inglés y francés tienen una buena proporción.
# Topic counts, filled by nationality.
ggplot(data, aes(x = Topic, fill = NationalITy)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(0, 100, 4)) +
  coord_flip() +
  labs(x = "Topic", y = "Student Count")

TI tiene la mayor cantidad de estudiantes de KW. Química tiene la menor
diversidad. Francés tiene la mayor diversidad.
# Topic counts, filled by educational stage.
ggplot(data, aes(x = Topic, fill = StageID)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(0, 100, 4)) +
  coord_flip() +
  labs(x = "Topic", y = "Student Count")

Geología, biología y química solo se imparten en la escuela secundaria.
En francés todos los estudiantes son de nivel bajo, a excepción de un
estudiante de nivel alto.
# Topic counts, filled by section.
ggplot(data, aes(x = Topic, fill = SectionID)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(0, 100, 4)) +
  coord_flip() +
  labs(x = "Topic", y = "Student Count")

La sección C tiene estudiantes de español y TI.
# Topic counts, filled by semester.
ggplot(data, aes(x = Topic, fill = Semester)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(0, 100, 4)) +
  coord_flip() +
  labs(x = "Topic", y = "Student Count")

TI tiene mayormente estudiantes de primer semestre.
# Topic counts, filled by the guardian relation.
ggplot(data, aes(x = Topic, fill = Relation)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(0, 100, 4)) +
  coord_flip() +
  labs(x = "Topic", y = "Student Count")

La mayoría de los estudiantes de francés tienen a la madre como tutora,
mientras que la mayoría de los estudiantes de TI tienen al padre como
tutor.
# Topic counts, filled by Class (absolute counts).
ggplot(data = data, aes(x = Topic, fill = Class)) +
  geom_bar() +
  labs(x = "Topic", y = "Student Count") +
  coord_flip() +
  scale_y_continuous(breaks = seq(0, 100, 4))

# Same breakdown as within-topic proportions. With position = "fill" the
# y axis runs from 0 to 1, so the breaks and the axis label are expressed as
# proportions rather than counts.
ggplot(data = data, aes(x = Topic, fill = Class)) +
  geom_bar(position = "fill") +
  labs(x = "Topic", y = "Proportion of Students") +
  coord_flip() +
  scale_y_continuous(breaks = seq(0, 1, 0.1))

La geología no tiene estudiantes de clase baja :0 (Somos
inteligentes B) )
# Student counts per semester.
ggplot(data = data, aes(x = Semester)) +
  geom_bar() +
  labs(x = "Semester", y = "Student Count")

# Student counts per guardian relation, filled by semester.
# The axis label spelling is corrected ("Guardian").
ggplot(data = data, aes(x = Relation, fill = Semester)) +
  geom_bar() +
  labs(x = "Guardian", y = "Student Count")

# Survey participation, filled by the parent's satisfaction with the school.
ggplot(
  data = data,
  aes(x = ParentAnsweringSurvey, fill = ParentschoolSatisfaction)
) +
  geom_bar() +
  labs(x = "Does parent answer survey ?", y = "Student Count")

La mayoría de los padres que no están satisfechos con la escuela no
responden a la encuesta.
# Student counts per parent satisfaction level.
# The axis label spelling is corrected ("satisfied").
ggplot(data = data, aes(x = ParentschoolSatisfaction)) +
  geom_bar() +
  labs(x = "Is the parent satisfied with the school ?", y = "Student Count")

# Student counts per absence category.
ggplot(data = data, aes(x = StudentAbsenceDays)) +
  geom_bar() +
  labs(
    x = "Is the student absent for more than seven days",
    y = "Student Count"
  )

# Student counts per Class, filled by gender.
ggplot(data = data, aes(x = Class, fill = gender)) +
  geom_bar() +
  labs(x = "Class", y = "Student Count")

Muy pocas chicas en clase baja
# Class counts, filled by guardian relation.
ggplot(data, aes(x = Class, fill = Relation)) +
  geom_bar() +
  labs(y = "Student Count", x = "Class")

Los estudiantes que tienen a la madre como tutora tienen mayores
posibilidades de obtener calificaciones de clase alta.
# Class counts, filled by whether the parent answered the survey.
ggplot(data, aes(x = Class, fill = ParentAnsweringSurvey)) +
  geom_bar() +
  labs(y = "Student Count", x = "Class")

Los estudiantes cuyos padres responden a la encuesta son los que
obtienen buenas calificaciones.
# Class counts, filled by absence category.
ggplot(data, aes(x = Class, fill = StudentAbsenceDays)) +
  geom_bar() +
  labs(y = "Student Count", x = "Class")

Los estudiantes que más se ausentan son los que obtienen calificaciones
bajas.
Gráfico de cajas
# Box plot of raisedhands grouped by gender.
ggplot(data, aes(x = gender, y = raisedhands)) +
  geom_boxplot()

Las chicas levantan más la mano.
# Box plot of VisITedResources grouped by gender.
ggplot(data, aes(x = gender, y = VisITedResources)) +
  geom_boxplot()

Las chicas visitan más recursos
# Box plot of raisedhands grouped by nationality.
ggplot(data, aes(x = NationalITy, y = raisedhands)) +
  geom_boxplot()

Jordania levanta más la mano que KW. Libia tiene el nivel más bajo. Irak y
Palestina tienen los niveles más altos de manos levantadas.
# Box plot of raisedhands grouped by educational stage.
ggplot(data, aes(x = StageID, y = raisedhands)) +
  geom_boxplot()

Más discusiones y charla en la escuela secundaria
# Box plot of raisedhands grouped by grade.
ggplot(data, aes(x = GradeID, y = raisedhands)) +
  geom_boxplot()

El grado 6 tiene, en promedio, la mayor cantidad de manos levantadas.
# Box plot of Discussion grouped by section.
ggplot(data, aes(x = SectionID, y = Discussion)) +
  geom_boxplot()

La sección C tiene la discusión más baja
# Box plot of raisedhands grouped by topic.
ggplot(data, aes(x = Topic, y = raisedhands)) +
  geom_boxplot()

Curiosamente, TI tiene muy pocas manos levantadas, aunque la mayoría
de los estudiantes estudian allí.
# Box plot of raisedhands grouped by semester.
ggplot(data, aes(x = Semester, y = raisedhands)) +
  geom_boxplot()

En el segundo semestre los estudiantes levantan más la mano.
# Box plot of raisedhands grouped by guardian relation.
ggplot(data, aes(x = Relation, y = raisedhands)) +
  geom_boxplot()

Cuando las madres son las tutoras hay mayor cantidad de
participación (manos levantadas)
# Box plot of raisedhands grouped by whether the parent answered the survey.
ggplot(data, aes(x = ParentAnsweringSurvey, y = raisedhands)) +
  geom_boxplot()

Cuando los padres respondieron la encuesta, hubo más manos
levantadas.
# Remaining activity counters grouped by survey participation.
ggplot(data, aes(x = ParentAnsweringSurvey, y = VisITedResources)) +
  geom_boxplot()

ggplot(data, aes(x = ParentAnsweringSurvey, y = AnnouncementsView)) +
  geom_boxplot()

ggplot(data, aes(x = ParentAnsweringSurvey, y = Discussion)) +
  geom_boxplot()

# Box plot of raisedhands grouped by parent satisfaction with the school.
ggplot(data, aes(x = ParentschoolSatisfaction, y = raisedhands)) +
  geom_boxplot()

Si la satisfacción fue buena, hay más manos levantadas
(participación).
# Remaining activity counters grouped by parent satisfaction.
ggplot(data, aes(x = ParentschoolSatisfaction, y = VisITedResources)) +
  geom_boxplot()

ggplot(data, aes(x = ParentschoolSatisfaction, y = AnnouncementsView)) +
  geom_boxplot()

ggplot(data, aes(x = ParentschoolSatisfaction, y = Discussion)) +
  geom_boxplot()

# Box plot of raisedhands grouped by absence category.
ggplot(data, aes(x = StudentAbsenceDays, y = raisedhands)) +
  geom_boxplot()

Mayor cantidad de faltas, menor cantidad de participación
# Remaining activity counters grouped by absence category.
ggplot(data, aes(x = StudentAbsenceDays, y = VisITedResources)) +
  geom_boxplot()

ggplot(data, aes(x = StudentAbsenceDays, y = AnnouncementsView)) +
  geom_boxplot()

ggplot(data, aes(x = StudentAbsenceDays, y = Discussion)) +
  geom_boxplot()

# Box plot of raisedhands grouped by survey participation.
# NOTE(review): the same ParentAnsweringSurvey/raisedhands plot already
# appears earlier in this file; consider removing one of the two.
ggplot(data, aes(x = ParentAnsweringSurvey, y = raisedhands)) +
  geom_boxplot()

Encuesta respondida (sí) -> más manos levantadas
# Activity counters grouped by survey participation.
# NOTE(review): these three ParentAnsweringSurvey plots also appear earlier
# in this file; consider removing the repetition.
ggplot(data, aes(x = ParentAnsweringSurvey, y = VisITedResources)) +
  geom_boxplot()

ggplot(data, aes(x = ParentAnsweringSurvey, y = AnnouncementsView)) +
  geom_boxplot()

ggplot(data, aes(x = ParentAnsweringSurvey, y = Discussion)) +
  geom_boxplot()

Gráficos de caja de clase
# Box plot of raisedhands grouped by Class.
ggplot(data, aes(x = Class, y = raisedhands)) +
  geom_boxplot()

Calificaciones altas -> más manos levantadas
# Box plot of VisITedResources grouped by Class.
ggplot(data, aes(x = Class, y = VisITedResources)) +
  geom_boxplot()

Altas calificaciones cuando se revisan (visitan) más los
recursos
# Box plot of AnnouncementsView grouped by Class.
ggplot(data, aes(x = Class, y = AnnouncementsView)) +
  geom_boxplot()

Mejores calificaciones cuando hay más vistas de anuncios
# Box plot of Discussion grouped by Class.
ggplot(data, aes(x = Class, y = Discussion)) +
  geom_boxplot()

A mayores calificaciones, mayor participación en las discusiones
Gráfico de dispersión
# Pairwise scatter plots of the four activity counters, each with a linear
# fit. The formula is given explicitly so geom_smooth() does not have to
# guess it and announce the choice on every plot.
ggplot(data = data, aes(x = raisedhands, y = VisITedResources)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

ggplot(data = data, aes(x = raisedhands, y = AnnouncementsView)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

ggplot(data = data, aes(x = raisedhands, y = Discussion)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

ggplot(data = data, aes(x = VisITedResources, y = AnnouncementsView)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

ggplot(data = data, aes(x = VisITedResources, y = Discussion)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

ggplot(data = data, aes(x = AnnouncementsView, y = Discussion)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

Gráfica de densidad
# Density of raisedhands, one curve per level of each grouping column.
ggplot(data, aes(x = raisedhands, color = gender)) +
  geom_density()

ggplot(data, aes(x = raisedhands, color = Topic)) +
  geom_density()

ggplot(data, aes(x = raisedhands, color = SectionID)) +
  geom_density()

ggplot(data, aes(x = raisedhands, color = Semester)) +
  geom_density()

ggplot(data, aes(x = raisedhands, color = Class)) +
  geom_density()

Tile Map
# Count students per (gender, nationality) pair, largest counts first.
# .groups = "drop" returns an ungrouped result, so later verbs are not
# affected by a leftover grouping and summarise() emits no grouping message.
tile.map <- data %>%
  group_by(gender, NationalITy) %>%
  summarise(Count = n(), .groups = "drop") %>%
  arrange(desc(Count))

# Tile map of the counts; y is named explicitly instead of passed by position.
ggplot(data = tile.map, aes(x = gender, y = NationalITy, fill = Count)) +
  geom_tile()

Modelado predictivo
Dividir los datos en conjuntos de entrenamiento y validación.
# Fix the RNG seed so the split is reproducible.
set.seed(55)
# Logical mask from caTools::sample.split on Class with SplitRatio = 0.75;
# TRUE rows go to the training set.
split <- sample.split(data$Class, SplitRatio = 0.75)
# Use TRUE/FALSE rather than T/F, which are ordinary variables that can be
# reassigned.
train <- subset(data, split == TRUE)
cv <- subset(data, split == FALSE)
Árbol de decisiones
# Fit a classification tree predicting Class from every other column of the
# training set, then draw the fitted tree.
tree.model <- rpart(
  formula = Class ~ .,
  data = train,
  method = "class",
  minbucket = 1
)
prp(tree.model)

accuracy = 0.7355(minbucket = 20)
accuracy = 0.7603(minbucket = 10)
accuracy = 0.8181(minbucket = 1)
Conclusiones