## load libraries 

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(class)
library(rpart)
library(rpart.plot)
library(e1071)
library(caret)
## Loading required package: lattice
library(caTools)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## 
## Attaching package: 'party'
## The following object is masked from 'package:dplyr':
## 
##     where
## input 

data <- read.csv("xAPI-Edu-Data.csv")
str(data)
## 'data.frame':    480 obs. of  17 variables:
##  $ gender                  : chr  "M" "M" "M" "M" ...
##  $ NationalITy             : chr  "KW" "KW" "KW" "KW" ...
##  $ PlaceofBirth            : chr  "KuwaIT" "KuwaIT" "KuwaIT" "KuwaIT" ...
##  $ StageID                 : chr  "lowerlevel" "lowerlevel" "lowerlevel" "lowerlevel" ...
##  $ GradeID                 : chr  "G-04" "G-04" "G-04" "G-04" ...
##  $ SectionID               : chr  "A" "A" "A" "A" ...
##  $ Topic                   : chr  "IT" "IT" "IT" "IT" ...
##  $ Semester                : chr  "F" "F" "F" "F" ...
##  $ Relation                : chr  "Father" "Father" "Father" "Father" ...
##  $ raisedhands             : int  15 20 10 30 40 42 35 50 12 70 ...
##  $ VisITedResources        : int  16 20 7 25 50 30 12 10 21 80 ...
##  $ AnnouncementsView       : int  2 3 0 5 12 13 0 15 16 25 ...
##  $ Discussion              : int  20 25 30 35 50 70 17 22 50 70 ...
##  $ ParentAnsweringSurvey   : chr  "Yes" "Yes" "No" "No" ...
##  $ ParentschoolSatisfaction: chr  "Good" "Good" "Bad" "Bad" ...
##  $ StudentAbsenceDays      : chr  "Under-7" "Under-7" "Above-7" "Above-7" ...
##  $ Class                   : chr  "M" "M" "L" "L" ...
summary(data)
##     gender          NationalITy        PlaceofBirth         StageID         
##  Length:480         Length:480         Length:480         Length:480        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    GradeID           SectionID            Topic             Semester        
##  Length:480         Length:480         Length:480         Length:480        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Relation          raisedhands     VisITedResources AnnouncementsView
##  Length:480         Min.   :  0.00   Min.   : 0.0     Min.   : 0.00    
##  Class :character   1st Qu.: 15.75   1st Qu.:20.0     1st Qu.:14.00    
##  Mode  :character   Median : 50.00   Median :65.0     Median :33.00    
##                     Mean   : 46.77   Mean   :54.8     Mean   :37.92    
##                     3rd Qu.: 75.00   3rd Qu.:84.0     3rd Qu.:58.00    
##                     Max.   :100.00   Max.   :99.0     Max.   :98.00    
##    Discussion    ParentAnsweringSurvey ParentschoolSatisfaction
##  Min.   : 1.00   Length:480            Length:480              
##  1st Qu.:20.00   Class :character      Class :character        
##  Median :39.00   Mode  :character      Mode  :character        
##  Mean   :43.28                                                 
##  3rd Qu.:70.00                                                 
##  Max.   :99.00                                                 
##  StudentAbsenceDays    Class          
##  Length:480         Length:480        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

Análisis exploratiro de los datos

Histogramas

ggplot(data = data, aes(x = raisedhands)) + geom_histogram(color = "black") +
    scale_x_continuous(breaks = seq(0,100,5)) + 
    labs(x = "Raised Hands", y = "Student Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = data, aes(x = VisITedResources)) + geom_histogram(color = "black") +
    scale_x_continuous(breaks = seq(0,100,5)) + 
    labs(x = "Visited Resources", y = "Student Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = data, aes(x = AnnouncementsView)) + geom_histogram(color = "black") +
    scale_x_continuous(breaks = seq(0,100,5)) + 
    labs(x = "Announcements View", y = "Student Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = data, aes(x = Discussion)) + geom_histogram(color = "black") +
    scale_x_continuous(breaks = seq(0,100,5)) + 
    labs(x = "Discussion Participation", y = "Student Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Gráfica de barras

ggplot(data = data, aes(x = gender)) + geom_bar() + 
    labs(x = "Gender", y = "Student Count") +
    scale_y_continuous(breaks = seq(0,300,30)) + coord_flip()

ggplot(data = data, aes(x = NationalITy)) + geom_bar() + 
    labs(x = "Nationality", y = "Student Count") +
    scale_y_continuous(breaks = seq(0,200,20)) + coord_flip()

Jordan y KW tienen el mayor número de estudiantes

ggplot(data = data, aes(x = PlaceofBirth)) + geom_bar(aes(fill = NationalITy)) + 
    labs(x = "Birth Place", y = "Student Count") + coord_flip() # usa is a mix of nationalities

ggplot(data = data, aes(x = GradeID, fill = Class)) + geom_bar() + 
    labs(x = "Grade ID", y = "Student Count") + coord_flip() # g-06 has students with only low grades

ggplot(data = data, aes(x = GradeID, fill = gender)) + geom_bar() + 
    labs(x = "Grade ID", y = "Student Count") + coord_flip() # g-10 has no females

ggplot(data = data, aes(x = SectionID, fill = Topic)) + geom_bar() +
    labs(x = "Section ID", y = "Student Count") +
    coord_flip()

La clase C tiene solamente estudiantes de TI y ciencias

ggplot(data = data, aes(x = Topic, fill = gender)) + geom_bar() +
    labs(x = "Topic", y = "Student Count") +
    scale_y_continuous(breaks = seq(0,100,4)) + coord_flip()

El español tiene un peor hombres:mujeres ratio mientars que ciencia, química, inglés y fránces que tiene un buen ratio.

ggplot(data = data, aes(x = Topic, fill = NationalITy)) + geom_bar() +
    labs(x = "Topic", y = "Student Count") + coord_flip() +
    scale_y_continuous(breaks = seq(0,100,4))

It tiene la mayor cantidad de personas de KW. Química tiene la menor diverdiad. Fráncia tiene la mayor diversidad.

ggplot(data = data, aes(x = Topic, fill = StageID)) + geom_bar() +
    labs(x = "Topic", y = "Student Count") + coord_flip() +
    scale_y_continuous(breaks = seq(0,100,4))

Geología, biología y química sólo en la escuela secundaria. El fránces tiene todo en bajo nivel, a excepción de un estudiante de alto nivel.

ggplot(data = data, aes(x = Topic, fill = SectionID)) + geom_bar() +
    labs(x = "Topic", y = "Student Count") + coord_flip() +
    scale_y_continuous(breaks = seq(0,100,4))

La sección C tiene estudiantes de español y Ti.

ggplot(data = data, aes(x = Topic, fill = Semester)) + geom_bar() +
    labs(x = "Topic", y = "Student Count") + coord_flip() +
    scale_y_continuous(breaks = seq(0,100,4))

Ti mayormente tiene estudiantes que son de primer semestre.

ggplot(data = data, aes(x = Topic, fill = Relation)) + geom_bar() +
    labs(x = "Topic", y = "Student Count") + coord_flip() +
    scale_y_continuous(breaks = seq(0,100,4))

La mayoría de los estudiantes franceses tienen mamá como tutor, mientras que la mayoría de los estudiantes de TI tienen padres como tutores.

ggplot(data = data, aes(x = Topic, fill = Class)) + geom_bar() +
    labs(x = "Topic", y = "Student Count") + coord_flip() +
    scale_y_continuous(breaks = seq(0,100,4))

ggplot(data = data, aes(x = Topic, fill = Class)) + geom_bar(position = "fill") +
    labs(x = "Topic", y = "Student Count") + coord_flip() +
    scale_y_continuous(breaks = seq(0,100,4))

La geología no tiene estudiantes de clase baja :0 (Somos inteligentes B) )

ggplot(data = data, aes(x = Semester)) + geom_bar() + 
    labs(x = "Semester", y = "Student Count")

ggplot(data = data, aes(x = Relation, fill = Semester)) + geom_bar() +
    labs(x = "Gaurdian", y = "Student Count")

ggplot(data = data, aes(x = ParentAnsweringSurvey, fill = ParentschoolSatisfaction)) +
    geom_bar() + 
    labs(x = "Does parent answer survey ?", y = "Student Count")

La mayoría de los padres que no están satisfechos con la escuela no responden a la encuesta.

ggplot(data = data, aes(x = ParentschoolSatisfaction)) +
    geom_bar() + 
    labs(x = "Is the parent satified with the school ?", y = "Student Count")

ggplot(data = data, aes(x = StudentAbsenceDays)) + geom_bar() + 
            labs(x = "Is the student absent for more than seven days", y = "Student Count")

ggplot(data = data, aes(x = Class, fill = gender)) + geom_bar() +
    labs(x = "Class", y = "Student Count")

Muy pocas chicas en clase baja

ggplot(data = data, aes(x = Class, fill = Relation)) + geom_bar() +
    labs(x = "Class", y = "Student Count")

Los estudiantes que tienen madres como guardianes tienen mayores posibilidades de obtener calificaciones de clase alta.

ggplot(data = data, aes(x = Class, fill = ParentAnsweringSurvey)) + geom_bar() +
    labs(x = "Class", y = "Student Count")

Los estudiantes cuyos padres responden a la encuesta son los que obtienen buenas calificaciones.

ggplot(data = data, aes(x = Class, fill = StudentAbsenceDays)) + geom_bar() +
    labs(x = "Class", y = "Student Count")

Los estudiantes que se ponen ausentes son los que obtienen bajas calificaciones.

Gráfico de cajas

ggplot(data = data, aes(x = gender, y = raisedhands)) + geom_boxplot()

Las chicas tienen más aumentos a mano

ggplot(data = data, aes(x = gender, y = VisITedResources)) + geom_boxplot()

Las chicas visitan más recursos

ggplot(data = data, aes(x = NationalITy, y = raisedhands)) + geom_boxplot()

Jordan más aumentos de mano que KW. Lybia más baja. Iraq y Palestina más altos de mano.

ggplot(data = data, aes(x = StageID, y = raisedhands)) + geom_boxplot()

Más aumentos de mano alzadas en las escuelas intermedias.

ggplot(data = data, aes(x = StageID, y = Discussion)) + geom_boxplot()

Más discusiones y charla en la escuela secundaria

ggplot(data = data, aes(x = GradeID, y = raisedhands)) + geom_boxplot()

El grado 6 tiene la mayor cantidad de aumentos de aza de manos en promedio.

ggplot(data = data, aes(x = SectionID, y = Discussion)) + geom_boxplot()

La sección C tiene la discusión más baja

ggplot(data = data, aes(x = Topic, y = raisedhands)) + geom_boxplot()

IT tiene muy pocos aumentos de mano intersiletes, aunque la mayoría de los estudiantes estudian allí.

ggplot(data = data, aes(x = Semester, y = raisedhands)) + geom_boxplot()

En segundo semestre los estudiantes levantan la mano

ggplot(data = data, aes(x = Relation, y = raisedhands)) + geom_boxplot()

Cuando las madres son las tutoras hay mayor cantidad de participación (manos levantadas)

ggplot(data = data, aes(x = ParentAnsweringSurvey, y = raisedhands)) + geom_boxplot()

Respuesta de la encuesta fue positiva hubo más aumentos de la mano

ggplot(data = data, aes(x = ParentAnsweringSurvey, y = VisITedResources)) + geom_boxplot()

ggplot(data = data, aes(x = ParentAnsweringSurvey, y = AnnouncementsView)) + geom_boxplot()

ggplot(data = data, aes(x = ParentAnsweringSurvey, y = Discussion)) + geom_boxplot()

ggplot(data = data, aes(x = ParentschoolSatisfaction, y = raisedhands)) + geom_boxplot()

Si la satisfacción fue buena entonces hay mayor aumentos de alza de mano (participación)

ggplot(data = data, aes(x = ParentschoolSatisfaction, y = VisITedResources)) + geom_boxplot()

ggplot(data = data, aes(x = ParentschoolSatisfaction, y = AnnouncementsView)) + geom_boxplot()

ggplot(data = data, aes(x = ParentschoolSatisfaction, y = Discussion)) + geom_boxplot()

ggplot(data = data, aes(x = StudentAbsenceDays, y = raisedhands)) + geom_boxplot()

Mayor cantidad de faltas, menor cantidad de participación

ggplot(data = data, aes(x = StudentAbsenceDays, y = VisITedResources)) + geom_boxplot()

ggplot(data = data, aes(x = StudentAbsenceDays, y = AnnouncementsView)) + geom_boxplot()

ggplot(data = data, aes(x = StudentAbsenceDays, y = Discussion)) + geom_boxplot()

ggplot(data = data, aes(x = ParentAnsweringSurvey, y = raisedhands)) + geom_boxplot()

survy answer- yes –> more hand raises

ggplot(data = data, aes(x = ParentAnsweringSurvey, y = VisITedResources)) + geom_boxplot()

ggplot(data = data, aes(x = ParentAnsweringSurvey, y = AnnouncementsView)) + geom_boxplot()

ggplot(data = data, aes(x = ParentAnsweringSurvey, y = Discussion)) + geom_boxplot()

Gráficos de caja de clase

ggplot(data = data, aes(x = Class, y = raisedhands)) + geom_boxplot()

Marcas altas -> elevaciones de mano alta

ggplot(data = data, aes(x = Class, y = VisITedResources)) + geom_boxplot()

Altas calificaciones cuando se revisan (visitan) más los recursos

ggplot(data = data, aes(x = Class, y = AnnouncementsView)) + geom_boxplot()

MEjores marcas cuando hay más vistas de anuncios

ggplot(data = data, aes(x = Class, y = Discussion)) + geom_boxplot()

Cuando hay mayores marcas hay mayores discusiones

Gráfico de dispersión

ggplot(data = data, aes( x = raisedhands, y = VisITedResources)) + geom_point() +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(data = data, aes( x = raisedhands, y = AnnouncementsView)) + geom_point() +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(data = data, aes( x = raisedhands, y = Discussion)) + geom_point() +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(data = data, aes( x = VisITedResources, y = AnnouncementsView)) + geom_point() +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(data = data, aes( x = VisITedResources, y = Discussion)) + geom_point() +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(data = data, aes( x = AnnouncementsView, y = Discussion)) + geom_point() +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Gráfica de densidad

ggplot(data = data, aes(x = raisedhands, color = gender)) + geom_density()

ggplot(data = data, aes(x = raisedhands, color = Topic)) + geom_density()

ggplot(data = data, aes(x = raisedhands, color = SectionID)) + geom_density()

ggplot(data = data, aes(x = raisedhands, color = Semester)) + geom_density()

ggplot(data = data, aes(x = raisedhands, color = Class)) + geom_density()

Tile Map

tile.map <- data %>% group_by(gender, NationalITy) %>%
                        summarise(Count = n()) %>% arrange(desc(Count))
## `summarise()` has grouped output by 'gender'. You can override using the
## `.groups` argument.
ggplot(data = tile.map, aes(x = gender, NationalITy, fill = Count)) + geom_tile()

Modelado predictivo

Dividir datos en conjuntos de tren y validación cruzada.

set.seed(55)
split <- sample.split(data$Class, SplitRatio = 0.75)
train <- subset(data, split == T)
cv <- subset(data, split == F)

Árbol de decisiones

tree.model <- rpart(Class ~ ., data = train, method = "class", minbucket = 1)
prp(tree.model) 

accuracy = 0.7355(minbucket = 20)

accuracy = 0.7603(minbucket = 10)

accuracy = 0.8181(minbucket = 1)

Conclusiones