Inicio de la presentación

Descarga de las funciones a usar

# List the files available in the input directory. list.files() is used
# instead of shelling out to `ls` so the call does not depend on a Unix shell.
list.files("../input")

## load libraries ============================================================================

library(ggplot2)
library(dplyr)
library(randomForest)
library(class)
library(rpart)
library(rpart.plot)
library(e1071)
library(caret)
library(caTools)
library(party)

## input =====================================================================================

# Read the student dataset (path is relative to the working directory) and
# print the structure of the resulting data frame.
data <- read.csv(file = "xAPI-Edu-Data.csv")
str(data)
## 'data.frame':    480 obs. of  17 variables:
##  $ gender                  : chr  "M" "M" "M" "M" ...
##  $ NationalITy             : chr  "KW" "KW" "KW" "KW" ...
##  $ PlaceofBirth            : chr  "KuwaIT" "KuwaIT" "KuwaIT" "KuwaIT" ...
##  $ StageID                 : chr  "lowerlevel" "lowerlevel" "lowerlevel" "lowerlevel" ...
##  $ GradeID                 : chr  "G-04" "G-04" "G-04" "G-04" ...
##  $ SectionID               : chr  "A" "A" "A" "A" ...
##  $ Topic                   : chr  "IT" "IT" "IT" "IT" ...
##  $ Semester                : chr  "F" "F" "F" "F" ...
##  $ Relation                : chr  "Father" "Father" "Father" "Father" ...
##  $ raisedhands             : int  15 20 10 30 40 42 35 50 12 70 ...
##  $ VisITedResources        : int  16 20 7 25 50 30 12 10 21 80 ...
##  $ AnnouncementsView       : int  2 3 0 5 12 13 0 15 16 25 ...
##  $ Discussion              : int  20 25 30 35 50 70 17 22 50 70 ...
##  $ ParentAnsweringSurvey   : chr  "Yes" "Yes" "No" "No" ...
##  $ ParentschoolSatisfaction: chr  "Good" "Good" "Bad" "Bad" ...
##  $ StudentAbsenceDays      : chr  "Under-7" "Under-7" "Above-7" "Above-7" ...
##  $ Class                   : chr  "M" "M" "L" "L" ...
# Per-column summary of the dataset.
summary(object = data)
##     gender          NationalITy        PlaceofBirth         StageID         
##  Length:480         Length:480         Length:480         Length:480        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    GradeID           SectionID            Topic             Semester        
##  Length:480         Length:480         Length:480         Length:480        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Relation          raisedhands     VisITedResources AnnouncementsView
##  Length:480         Min.   :  0.00   Min.   : 0.0     Min.   : 0.00    
##  Class :character   1st Qu.: 15.75   1st Qu.:20.0     1st Qu.:14.00    
##  Mode  :character   Median : 50.00   Median :65.0     Median :33.00    
##                     Mean   : 46.77   Mean   :54.8     Mean   :37.92    
##                     3rd Qu.: 75.00   3rd Qu.:84.0     3rd Qu.:58.00    
##                     Max.   :100.00   Max.   :99.0     Max.   :98.00    
##    Discussion    ParentAnsweringSurvey ParentschoolSatisfaction
##  Min.   : 1.00   Length:480            Length:480              
##  1st Qu.:20.00   Class :character      Class :character        
##  Median :39.00   Mode  :character      Mode  :character        
##  Mean   :43.28                                                 
##  3rd Qu.:70.00                                                 
##  Max.   :99.00                                                 
##  StudentAbsenceDays    Class          
##  Length:480         Length:480        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

Análisis exploratorio de datos

Histogramas

# Histograms of the four numeric activity counters. binwidth = 5 with
# boundary = 0 lines the bins up with the x-axis breaks (every 5 units)
# instead of falling back to the default of 30 bins, which ggplot2 flags with
# a "Pick better value with `binwidth`" message.
ggplot(data = data, aes(x = raisedhands)) +
  geom_histogram(binwidth = 5, boundary = 0, color = "black") +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(x = "Raised Hands", y = "Student Count")

ggplot(data = data, aes(x = VisITedResources)) +
  geom_histogram(binwidth = 5, boundary = 0, color = "black") +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(x = "Visited Resources", y = "Student Count")

ggplot(data = data, aes(x = AnnouncementsView)) +
  geom_histogram(binwidth = 5, boundary = 0, color = "black") +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(x = "Announcements View", y = "Student Count")

ggplot(data = data, aes(x = Discussion)) +
  geom_histogram(binwidth = 5, boundary = 0, color = "black") +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(x = "Discussion Participation", y = "Student Count")

Gráficos de barras

# Horizontal bar chart: number of students per gender.
ggplot(data, aes(gender)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(from = 0, to = 300, by = 30)) +
  labs(x = "Gender", y = "Student Count") +
  coord_flip()

# Horizontal bar chart: number of students per nationality.
ggplot(data, aes(NationalITy)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(from = 0, to = 200, by = 20)) +
  labs(x = "Nationality", y = "Student Count") +
  coord_flip()

Jordania y Kuwait (KW) tienen el mayor número de estudiantes.

# Students per birth place, bars split by nationality.
# Author's observation: USA is a mix of nationalities.
ggplot(data, aes(PlaceofBirth)) +
  geom_bar(aes(fill = NationalITy)) +
  coord_flip() +
  labs(x = "Birth Place", y = "Student Count")

# Students per grade, bars split by performance class.
# Author's observation: G-06 has students with only low grades.
ggplot(data, aes(GradeID, fill = Class)) +
  geom_bar() +
  coord_flip() +
  labs(x = "Grade ID", y = "Student Count")

# Students per grade, bars split by gender.
# Author's observation: G-10 has no females.
ggplot(data, aes(GradeID, fill = gender)) +
  geom_bar() +
  coord_flip() +
  labs(x = "Grade ID", y = "Student Count")

# Students per section, bars split by topic.
ggplot(data, aes(SectionID, fill = Topic)) +
  geom_bar() +
  coord_flip() +
  labs(x = "Section ID", y = "Student Count")

La sección C solo tiene estudiantes de IT y Ciencias.

# Students per topic, bars split by gender.
ggplot(data, aes(Topic, fill = gender)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(from = 0, to = 100, by = 4)) +
  labs(x = "Topic", y = "Student Count") +
  coord_flip()

Español tiene la peor relación hombre-mujer, mientras que ciencias, química, inglés y francés tienen una buena relación.

# Students per topic, bars split by nationality.
ggplot(data, aes(Topic, fill = NationalITy)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(from = 0, to = 100, by = 4)) +
  labs(x = "Topic", y = "Student Count") +
  coord_flip()

La mayoría de los estudiantes de IT son de Kuwait (KW); química es la materia con menos diversidad y francés la que tiene más.

# Students per topic, bars split by school stage.
ggplot(data, aes(Topic, fill = StageID)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(from = 0, to = 100, by = 4)) +
  labs(x = "Topic", y = "Student Count") +
  coord_flip()

Geología, biología y química solo tienen estudiantes de escuela media, y francés tiene a todos sus estudiantes en el nivel bajo, excepto uno.

# Students per topic, bars split by section.
ggplot(data, aes(Topic, fill = SectionID)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(from = 0, to = 100, by = 4)) +
  labs(x = "Topic", y = "Student Count") +
  coord_flip()

La sección C solo tiene estudiantes de español e IT.

# Students per topic, bars split by semester.
ggplot(data, aes(Topic, fill = Semester)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(from = 0, to = 100, by = 4)) +
  labs(x = "Topic", y = "Student Count") +
  coord_flip()

IT tiene la mayoría de sus estudiantes en el primer semestre.

# Students per topic, bars split by the guardian relation.
ggplot(data, aes(Topic, fill = Relation)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(from = 0, to = 100, by = 4)) +
  labs(x = "Topic", y = "Student Count") +
  coord_flip()

La mayoría de los estudiantes de francés tiene a su madre como tutora, mientras que la mayoría de los estudiantes de IT tiene a su padre como tutor.

# Students per topic, bars split by performance class (stacked counts).
ggplot(data, aes(Topic, fill = Class)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(from = 0, to = 100, by = 4)) +
  labs(x = "Topic", y = "Student Count") +
  coord_flip()

# Share of each performance class within every topic. position = "fill"
# rescales every bar to the 0-1 range, so the axis is a proportion rather
# than a count: the breaks and the axis label are set accordingly.
ggplot(data = data, aes(x = Topic, fill = Class)) +
  geom_bar(position = "fill") +
  scale_y_continuous(breaks = seq(0, 1, 0.1)) +
  labs(x = "Topic", y = "Proportion of Students") +
  coord_flip()

Geología no tiene estudiantes de la clase baja (L).

# Number of students per semester.
ggplot(data, aes(Semester)) +
  geom_bar() +
  labs(x = "Semester", y = "Student Count")

# Number of students per guardian relation, bars split by semester.
# The x-axis label previously read "Gaurdian" (misspelled).
ggplot(data = data, aes(x = Relation, fill = Semester)) +
  geom_bar() +
  labs(x = "Guardian", y = "Student Count")

# Students by whether the parent answered the survey, bars split by the
# parent's satisfaction with the school.
ggplot(data, aes(ParentAnsweringSurvey, fill = ParentschoolSatisfaction)) +
  geom_bar() +
  labs(x = "Does parent answer survey ?", y = "Student Count")

La mayoría de los padres que no están satisfechos con la escuela no respondieron la encuesta.

# Number of students per parent school-satisfaction level.
# The x-axis label previously read "satified" (misspelled).
ggplot(data = data, aes(x = ParentschoolSatisfaction)) +
  geom_bar() +
  labs(x = "Is the parent satisfied with the school ?", y = "Student Count")

# Number of students per absence category.
ggplot(data, aes(StudentAbsenceDays)) +
  geom_bar() +
  labs(
    x = "Is the student absent for more than seven days",
    y = "Student Count"
  )

# Number of students per performance class, bars split by gender.
ggplot(data, aes(Class, fill = gender)) +
  geom_bar() +
  labs(x = "Class", y = "Student Count")

Hay muy pocas chicas en la clase baja (L).

# Number of students per performance class, bars split by guardian relation.
ggplot(data, aes(Class, fill = Relation)) +
  geom_bar() +
  labs(x = "Class", y = "Student Count")

Los estudiantes que tienen a su madre como tutora tienen más posibilidades de obtener mejores notas.

# Number of students per performance class, bars split by whether the parent
# answered the survey.
ggplot(data, aes(Class, fill = ParentAnsweringSurvey)) +
  geom_bar() +
  labs(x = "Class", y = "Student Count")

Los estudiantes cuyos padres respondieron la encuesta son los que poseen buenas notas.

# Number of students per performance class, bars split by absence category.
ggplot(data, aes(Class, fill = StudentAbsenceDays)) +
  geom_bar() +
  labs(x = "Class", y = "Student Count")

Los estudiantes que más faltan son los que tienen las notas más bajas.

Diagramas de caja

# Box plot of raised hands by gender.
ggplot(data, aes(gender, raisedhands)) +
  geom_boxplot()

Las chicas son las que más levantan la mano.

# Box plot of visited resources by gender.
ggplot(data, aes(gender, VisITedResources)) +
  geom_boxplot()

Las chicas visitaron más recursos académicos.

# Box plot of raised hands by nationality.
ggplot(data, aes(NationalITy, raisedhands)) +
  geom_boxplot()

Jordania tiene más manos levantadas que Kuwait (KW); Libia es el país que menos tiene, e Irak y Palestina son los que más tienen.

# Box plot of raised hands by school stage.
ggplot(data, aes(StageID, raisedhands)) +
  geom_boxplot()

Los estudiantes de la escuela media levantan más la mano.

# Box plot of discussion participation by school stage.
ggplot(data, aes(StageID, Discussion)) +
  geom_boxplot()

Hay más discusiones en la preparatoria.

# Box plot of raised hands by grade.
ggplot(data, aes(GradeID, raisedhands)) +
  geom_boxplot()

El sexto grado tiene, en promedio, la mayor cantidad de manos levantadas.

# Box plot of discussion participation by section.
ggplot(data, aes(SectionID, Discussion)) +
  geom_boxplot()

La sección C es la que tiene menos discusiones.

# Box plot of raised hands by topic.
ggplot(data, aes(Topic, raisedhands)) +
  geom_boxplot()

IT tiene muy pocas manos levantadas a pesar de que la mayoría de los estudiantes estudian ahí.

# Box plot of raised hands by semester.
ggplot(data, aes(Semester, raisedhands)) +
  geom_boxplot()

El segundo semestre es en el que más se levanta la mano.

# Box plot of raised hands by guardian relation.
ggplot(data, aes(Relation, raisedhands)) +
  geom_boxplot()

Los estudiantes con madres tutoras levantan más la mano.

# Box plot of raised hands by whether the parent answered the survey.
ggplot(data, aes(ParentAnsweringSurvey, raisedhands)) +
  geom_boxplot()

Los estudiantes cuyos padres sí respondieron la encuesta levantan más la mano.

# Remaining activity measures by whether the parent answered the survey.
ggplot(data, aes(ParentAnsweringSurvey, VisITedResources)) +
  geom_boxplot()

ggplot(data, aes(ParentAnsweringSurvey, AnnouncementsView)) +
  geom_boxplot()

ggplot(data, aes(ParentAnsweringSurvey, Discussion)) +
  geom_boxplot()

# Raised hands by the parent's satisfaction with the school.
ggplot(data, aes(ParentschoolSatisfaction, raisedhands)) +
  geom_boxplot()

Los estudiantes cuyos padres calificaron su satisfacción con la escuela como buena levantan más la mano.

# Remaining activity measures by the parent's satisfaction with the school.
ggplot(data, aes(ParentschoolSatisfaction, VisITedResources)) +
  geom_boxplot()

ggplot(data, aes(ParentschoolSatisfaction, AnnouncementsView)) +
  geom_boxplot()

ggplot(data, aes(ParentschoolSatisfaction, Discussion)) +
  geom_boxplot()

# Raised hands by absence category.
ggplot(data, aes(StudentAbsenceDays, raisedhands)) +
  geom_boxplot()

Cuanto más faltan a clase, menos levantan la mano.

# Remaining activity measures by absence category.
ggplot(data, aes(StudentAbsenceDays, VisITedResources)) +
  geom_boxplot()

ggplot(data, aes(StudentAbsenceDays, AnnouncementsView)) +
  geom_boxplot()

ggplot(data, aes(StudentAbsenceDays, Discussion)) +
  geom_boxplot()

# Raised hands by whether the parent answered the survey.
ggplot(data, aes(ParentAnsweringSurvey, raisedhands)) +
  geom_boxplot()

Si los padres respondieron la encuesta, los estudiantes levantan más la mano.

# Visited resources, announcement views and discussion participation by
# whether the parent answered the survey.
ggplot(data, aes(ParentAnsweringSurvey, VisITedResources)) +
  geom_boxplot()

ggplot(data, aes(ParentAnsweringSurvey, AnnouncementsView)) +
  geom_boxplot()

ggplot(data, aes(ParentAnsweringSurvey, Discussion)) +
  geom_boxplot()

Gráficos de cajas por clases

# Box plot of raised hands by performance class.
ggplot(data, aes(Class, raisedhands)) +
  geom_boxplot()

Cuanto mejores son las notas, más levantan la mano.

# Box plot of visited resources by performance class.
ggplot(data, aes(Class, VisITedResources)) +
  geom_boxplot()

Cuanto mejores son las notas, más recursos académicos visitan.

# Box plot of announcement views by performance class.
ggplot(data, aes(Class, AnnouncementsView)) +
  geom_boxplot()

A mejores notas, más visualizaciones de anuncios.

# Box plot of discussion participation by performance class.
ggplot(data, aes(Class, Discussion)) +
  geom_boxplot()

A mejores notas, más participación en discusiones.

Gráficos de dispersión

# Pairwise scatter plots of the four numeric activity counters, each with a
# fitted linear trend line. The smoothing formula is passed explicitly so
# geom_smooth() does not have to announce the formula it defaulted to.
ggplot(data = data, aes(x = raisedhands, y = VisITedResources)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

ggplot(data = data, aes(x = raisedhands, y = AnnouncementsView)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

ggplot(data = data, aes(x = raisedhands, y = Discussion)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

ggplot(data = data, aes(x = VisITedResources, y = AnnouncementsView)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

ggplot(data = data, aes(x = VisITedResources, y = Discussion)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

ggplot(data = data, aes(x = AnnouncementsView, y = Discussion)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

Gráficos de densidad

# Density of raised hands, one curve per level of each categorical variable
# (gender, topic, section, semester, performance class).
ggplot(data, aes(raisedhands, color = gender)) +
  geom_density()

ggplot(data, aes(raisedhands, color = Topic)) +
  geom_density()

ggplot(data, aes(raisedhands, color = SectionID)) +
  geom_density()

ggplot(data, aes(raisedhands, color = Semester)) +
  geom_density()

ggplot(data, aes(raisedhands, color = Class)) +
  geom_density()

Tile Map

# Count students per (gender, nationality) pair, largest counts first.
# .groups = "drop" returns an ungrouped result, so no leftover grouping by
# gender is carried into later steps and summarise() emits no grouping message.
tile.map <- data %>%
  group_by(gender, NationalITy) %>%
  summarise(Count = n(), .groups = "drop") %>%
  arrange(desc(Count))

# Tile map of the counts; the y aesthetic is named explicitly.
ggplot(data = tile.map, aes(x = gender, y = NationalITy, fill = Count)) +
  geom_tile()

Modelos predictivos

Dividir los datos en conjuntos de entrenamiento y validación cruzada.

# Split the rows into a training set and a held-out set with
# caTools::sample.split on the Class labels (SplitRatio = 0.90). The seed is
# fixed so the split is reproducible.
set.seed(55)
split <- sample.split(data$Class, SplitRatio = 0.90)

# `split` is a logical vector: use it directly instead of comparing against
# T / F, which are ordinary variables that can be reassigned.
train <- subset(data, split)
cv <- subset(data, !split)

Árboles de decisión

# Fit a classification tree on the training rows, using every other column
# as a predictor, and draw the fitted tree.
tree.model <- rpart(
  formula = Class ~ .,
  data = train,
  method = "class",
  minbucket = 1
)
prp(tree.model)

# Predict the class of the held-out rows and cross-tabulate actual (rows)
# against predicted (columns).
tree.predict <- predict(tree.model, newdata = cv, type = "class")
table(cv$Class, tree.predict)
##    tree.predict
##      H  L  M
##   H 12  0  2
##   L  0 12  1
##   M  3  2 16

accuracy = 0.7355 (minbucket = 20)

accuracy = 0.7603 (minbucket = 10)

accuracy = 0.8333 (minbucket = 1; 40 de 48 aciertos según la matriz de confusión anterior)

Fin de la presentación.