## load libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(class)
library(rpart)
library(rpart.plot)
library(e1071)
library(caret)
## Loading required package: lattice
library(caTools)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'party'
## The following object is masked from 'package:dplyr':
##
## where
## input
# Read the student activity data set from the working directory and show the
# type and first values of every column.
data <- read.csv(file = "xAPI-Edu-Data.csv")
str(object = data)
## 'data.frame': 480 obs. of 17 variables:
## $ gender : chr "M" "M" "M" "M" ...
## $ NationalITy : chr "KW" "KW" "KW" "KW" ...
## $ PlaceofBirth : chr "KuwaIT" "KuwaIT" "KuwaIT" "KuwaIT" ...
## $ StageID : chr "lowerlevel" "lowerlevel" "lowerlevel" "lowerlevel" ...
## $ GradeID : chr "G-04" "G-04" "G-04" "G-04" ...
## $ SectionID : chr "A" "A" "A" "A" ...
## $ Topic : chr "IT" "IT" "IT" "IT" ...
## $ Semester : chr "F" "F" "F" "F" ...
## $ Relation : chr "Father" "Father" "Father" "Father" ...
## $ raisedhands : int 15 20 10 30 40 42 35 50 12 70 ...
## $ VisITedResources : int 16 20 7 25 50 30 12 10 21 80 ...
## $ AnnouncementsView : int 2 3 0 5 12 13 0 15 16 25 ...
## $ Discussion : int 20 25 30 35 50 70 17 22 50 70 ...
## $ ParentAnsweringSurvey : chr "Yes" "Yes" "No" "No" ...
## $ ParentschoolSatisfaction: chr "Good" "Good" "Bad" "Bad" ...
## $ StudentAbsenceDays : chr "Under-7" "Under-7" "Above-7" "Above-7" ...
## $ Class : chr "M" "M" "L" "L" ...
# Per-column overview of the data frame loaded above.
summary(object = data)
## gender NationalITy PlaceofBirth StageID
## Length:480 Length:480 Length:480 Length:480
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## GradeID SectionID Topic Semester
## Length:480 Length:480 Length:480 Length:480
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Relation raisedhands VisITedResources AnnouncementsView
## Length:480 Min. : 0.00 Min. : 0.0 Min. : 0.00
## Class :character 1st Qu.: 15.75 1st Qu.:20.0 1st Qu.:14.00
## Mode :character Median : 50.00 Median :65.0 Median :33.00
## Mean : 46.77 Mean :54.8 Mean :37.92
## 3rd Qu.: 75.00 3rd Qu.:84.0 3rd Qu.:58.00
## Max. :100.00 Max. :99.0 Max. :98.00
## Discussion ParentAnsweringSurvey ParentschoolSatisfaction
## Min. : 1.00 Length:480 Length:480
## 1st Qu.:20.00 Class :character Class :character
## Median :39.00 Mode :character Mode :character
## Mean :43.28
## 3rd Qu.:70.00
## Max. :99.00
## StudentAbsenceDays Class
## Length:480 Length:480
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Histograms of the four numeric activity counters.
# The bin width is set explicitly to 5 and aligned at 0 so that bins coincide
# with the 5-unit axis breaks; relying on the default of 30 bins makes
# stat_bin() ask for a better value on every plot.

# How often students raised their hand.
ggplot(data = data, aes(x = raisedhands)) +
  geom_histogram(binwidth = 5, boundary = 0, color = "black") +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(x = "Raised Hands", y = "Student Count")

# How many resources students visited.
ggplot(data = data, aes(x = VisITedResources)) +
  geom_histogram(binwidth = 5, boundary = 0, color = "black") +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(x = "Visited Resources", y = "Student Count")

# How many announcements students viewed.
ggplot(data = data, aes(x = AnnouncementsView)) +
  geom_histogram(binwidth = 5, boundary = 0, color = "black") +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(x = "Announcements View", y = "Student Count")

# How often students took part in discussions.
ggplot(data = data, aes(x = Discussion)) +
  geom_histogram(binwidth = 5, boundary = 0, color = "black") +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(x = "Discussion Participation", y = "Student Count")
#### En los histogramas anteriores, el gráfico cuyo eje x es “manos
levantadas” muestra cuántos estudiantes levantaron la mano cierta
cantidad de veces. El gráfico cuyo eje x es “recursos visitados” muestra
cuántos estudiantes visitaron cierta cantidad de recursos. El gráfico
cuyo eje x es “vistas de anuncios” muestra cuántos estudiantes vieron
cierta cantidad de anuncios. El gráfico cuyo eje x es “participación en
discusiones” muestra cuántos estudiantes participaron cierta cantidad de
veces en una discusión.
# Horizontal bar charts of student counts by demographic attribute.

# Gender, with a tick every 30 students on the count axis.
ggplot(data, aes(x = gender)) +
  geom_bar() +
  coord_flip() +
  scale_y_continuous(breaks = seq(from = 0, to = 300, by = 30)) +
  labs(x = "Gender", y = "Student Count")

# Nationality, with a tick every 20 students on the count axis.
ggplot(data, aes(x = NationalITy)) +
  geom_bar() +
  coord_flip() +
  scale_y_continuous(breaks = seq(from = 0, to = 200, by = 20)) +
  labs(x = "Nationality", y = "Student Count")

# Birth place, each bar split by nationality.
# Author's observation: the "usa" bar is a mix of nationalities.
ggplot(data, aes(x = PlaceofBirth, fill = NationalITy)) +
  geom_bar() +
  coord_flip() +
  labs(x = "Birth Place", y = "Student Count")
# Horizontal stacked bar charts by grade and by section.

# Grade split by Class.
# Author's observation: G-06 has students with only low grades.
ggplot(data, aes(x = GradeID, fill = Class)) +
  geom_bar() +
  coord_flip() +
  labs(x = "Grade ID", y = "Student Count")

# Grade split by gender.
# Author's observation: G-10 has no females.
ggplot(data, aes(x = GradeID, fill = gender)) +
  geom_bar() +
  coord_flip() +
  labs(x = "Grade ID", y = "Student Count")

# Section split by topic.
ggplot(data, aes(x = SectionID, fill = Topic)) +
  geom_bar() +
  coord_flip() +
  labs(x = "Section ID", y = "Student Count")
# Layers shared by every stacked bar chart of student counts per topic:
# horizontal bars with a tick every 4 students on the count axis.
topic_count_layers <- list(
  geom_bar(),
  labs(x = "Topic", y = "Student Count"),
  coord_flip(),
  scale_y_continuous(breaks = seq(0, 100, 4))
)

# One chart per fill variable; only the variable used to split the bars changes.
ggplot(data, aes(x = Topic, fill = gender)) + topic_count_layers
ggplot(data, aes(x = Topic, fill = NationalITy)) + topic_count_layers
ggplot(data, aes(x = Topic, fill = StageID)) + topic_count_layers
ggplot(data, aes(x = Topic, fill = SectionID)) + topic_count_layers
ggplot(data, aes(x = Topic, fill = Semester)) + topic_count_layers
ggplot(data, aes(x = Topic, fill = Relation)) + topic_count_layers
ggplot(data, aes(x = Topic, fill = Class)) + topic_count_layers
# Share of each Class value within every topic.
# position = "fill" rescales every bar to the 0-1 range, so the axis shows a
# proportion rather than a count: the breaks are placed on that scale and the
# label says so (count-style breaks from 0 to 100 would leave only the 0 tick).
ggplot(data = data, aes(x = Topic, fill = Class)) +
  geom_bar(position = "fill") +
  labs(x = "Topic", y = "Proportion of Students") +
  coord_flip() +
  scale_y_continuous(breaks = seq(0, 1, 0.1))
# Bar charts of student counts by semester, guardian and parent feedback.
# The axis labels are user-facing text; the misspellings "Gaurdian" and
# "satified" are corrected here.

# Students per semester.
ggplot(data = data, aes(x = Semester)) +
  geom_bar() +
  labs(x = "Semester", y = "Student Count")

# Guardian (Relation column), each bar split by semester.
ggplot(data = data, aes(x = Relation, fill = Semester)) +
  geom_bar() +
  labs(x = "Guardian", y = "Student Count")

# Whether the parent answered the survey, split by school satisfaction.
ggplot(data = data, aes(x = ParentAnsweringSurvey, fill = ParentschoolSatisfaction)) +
  geom_bar() +
  labs(x = "Does parent answer survey ?", y = "Student Count")

# Parent satisfaction with the school.
ggplot(data = data, aes(x = ParentschoolSatisfaction)) +
  geom_bar() +
  labs(x = "Is the parent satisfied with the school ?", y = "Student Count")

# Absence category (Under-7 / Above-7 days).
ggplot(data = data, aes(x = StudentAbsenceDays)) +
  geom_bar() +
  labs(x = "Is the student absent for more than seven days", y = "Student Count")
# Layers shared by the stacked bar charts of student counts per Class value.
class_count_layers <- list(
  geom_bar(),
  labs(x = "Class", y = "Student Count")
)

# One chart per fill variable; only the variable used to split the bars changes.
ggplot(data, aes(x = Class, fill = gender)) + class_count_layers
ggplot(data, aes(x = Class, fill = Relation)) + class_count_layers
ggplot(data, aes(x = Class, fill = ParentAnsweringSurvey)) + class_count_layers
ggplot(data, aes(x = Class, fill = StudentAbsenceDays)) + class_count_layers
# Box plots of activity counters against categorical attributes
# (aes() takes x first, then y).

# By gender.
ggplot(data, aes(gender, raisedhands)) + geom_boxplot()
ggplot(data, aes(gender, VisITedResources)) + geom_boxplot()

# By nationality.
ggplot(data, aes(NationalITy, raisedhands)) + geom_boxplot()

# By school stage.
ggplot(data, aes(StageID, raisedhands)) + geom_boxplot()
ggplot(data, aes(StageID, Discussion)) + geom_boxplot()

# By grade, section, topic, semester and guardian.
ggplot(data, aes(GradeID, raisedhands)) + geom_boxplot()
ggplot(data, aes(SectionID, Discussion)) + geom_boxplot()
ggplot(data, aes(Topic, raisedhands)) + geom_boxplot()
ggplot(data, aes(Semester, raisedhands)) + geom_boxplot()
ggplot(data, aes(Relation, raisedhands)) + geom_boxplot()
# Box plots of the four activity counters by parent satisfaction and by
# absence category.
# The ParentAnsweringSurvey box plots are not drawn here: the identical four
# plots are produced right after this group, next to their commentary, so
# drawing them here as well only duplicated the figures.

# By parent satisfaction with the school.
ggplot(data = data, aes(x = ParentschoolSatisfaction, y = raisedhands)) + geom_boxplot()
ggplot(data = data, aes(x = ParentschoolSatisfaction, y = VisITedResources)) + geom_boxplot()
ggplot(data = data, aes(x = ParentschoolSatisfaction, y = AnnouncementsView)) + geom_boxplot()
ggplot(data = data, aes(x = ParentschoolSatisfaction, y = Discussion)) + geom_boxplot()

# By absence category.
ggplot(data = data, aes(x = StudentAbsenceDays, y = raisedhands)) + geom_boxplot()
ggplot(data = data, aes(x = StudentAbsenceDays, y = VisITedResources)) + geom_boxplot()
ggplot(data = data, aes(x = StudentAbsenceDays, y = AnnouncementsView)) + geom_boxplot()
ggplot(data = data, aes(x = StudentAbsenceDays, y = Discussion)) + geom_boxplot()
# Box plots of the four activity counters, grouped by whether the parent
# answered the survey (aes() takes x first, then y).
ggplot(data, aes(ParentAnsweringSurvey, raisedhands)) + geom_boxplot()
ggplot(data, aes(ParentAnsweringSurvey, VisITedResources)) + geom_boxplot()
ggplot(data, aes(ParentAnsweringSurvey, AnnouncementsView)) + geom_boxplot()
ggplot(data, aes(ParentAnsweringSurvey, Discussion)) + geom_boxplot()
#### En el primer gráfico vemos que los datos son asimétricos y que los
que respondieron “Yes” tienen un mayor número de recursos visitados. En
el segundo gráfico vemos que para los que respondieron “No” los valores
son atípicos y que los que respondieron “Yes” tienen un mayor número de
vistas de anuncios. En el tercer gráfico vemos que los datos son
simétricos y que los que respondieron “Yes” tienen un mayor número de
discusiones.
# Box plots of the four activity counters, grouped by Class
# (aes() takes x first, then y).
ggplot(data, aes(Class, raisedhands)) + geom_boxplot()
ggplot(data, aes(Class, VisITedResources)) + geom_boxplot()
ggplot(data, aes(Class, AnnouncementsView)) + geom_boxplot()
ggplot(data, aes(Class, Discussion)) + geom_boxplot()
# Pairwise scatter plots of the four activity counters, each with a fitted
# straight line. The formula is passed explicitly (it is the one geom_smooth()
# would pick anyway) so the fit is stated in the code and ggplot2 no longer
# prints a "using formula" message for every plot.

# Raised hands against the other three counters.
ggplot(data = data, aes(x = raisedhands, y = VisITedResources)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)
ggplot(data = data, aes(x = raisedhands, y = AnnouncementsView)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)
ggplot(data = data, aes(x = raisedhands, y = Discussion)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

# Visited resources against announcements and discussions.
ggplot(data = data, aes(x = VisITedResources, y = AnnouncementsView)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)
ggplot(data = data, aes(x = VisITedResources, y = Discussion)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

# Announcements against discussions.
ggplot(data = data, aes(x = AnnouncementsView, y = Discussion)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)
#### En el primer gráfico de dispersión se comparan las variables manos
levantadas (eje “x”) y recursos visitados (eje “y”), que no tienen
ninguna correlación entre sí. En el segundo gráfico tenemos manos
levantadas en el eje “x” y anuncios vistos en el eje “y”, y estas
variables no tienen ninguna correlación entre sí. En el tercer gráfico
tenemos manos levantadas en el eje “x” y discusiones en el eje “y”, y
estas variables no tienen ninguna correlación entre sí. En el cuarto
gráfico tenemos recursos visitados en el eje “x” y anuncios vistos en el
eje “y”, y estas variables no tienen ninguna correlación entre sí. En el
quinto gráfico tenemos recursos visitados en el eje “x” y discusiones en
el eje “y”, y estas variables no tienen ninguna correlación entre sí. En
el sexto gráfico tenemos anuncios vistos en el eje “x” y discusiones en
el eje “y”, y estas variables no tienen ninguna correlación.
# Density curves of raised hands, one curve per level of the grouping
# variable mapped to color.
ggplot(data, aes(raisedhands, color = gender)) + geom_density()
ggplot(data, aes(raisedhands, color = Topic)) + geom_density()
ggplot(data, aes(raisedhands, color = SectionID)) + geom_density()
ggplot(data, aes(raisedhands, color = Semester)) + geom_density()
ggplot(data, aes(raisedhands, color = Class)) + geom_density()
#### En el primer gráfico de densidad observamos que el género masculino
tiene su mayor densidad cuando el número de manos levantadas es 12,5 y
que las mujeres tienen su mayor densidad cuando el número de manos
levantadas es 75. En el segundo gráfico podemos observar que la clase de
geología tiene su mayor densidad cuando el número de manos levantadas es
85. En el tercer gráfico podemos notar que las secciones A y B presentan
las mayores densidades, pero para A es mayor cuando el número de manos
levantadas es 77 y para B cuando es 13. En el cuarto gráfico podemos
observar que la densidad de los de primer semestre es mayor cuando el
número de manos levantadas es 12,5 y la de los de segundo semestre
cuando vale 77. En el quinto gráfico la mayor densidad la tiene la clase
L cuando el número de manos levantadas es 11, y esta misma clase tiene
la densidad más pequeña cuando el número de manos levantadas es 20.
# Number of students for every gender / nationality pair, largest first.
# .groups = "drop" returns an ungrouped table, so nothing downstream inherits
# the leftover `gender` grouping and summarise() no longer prints its
# grouped-output message.
tile.map <- data %>%
  group_by(gender, NationalITy) %>%
  summarise(Count = n(), .groups = "drop") %>%
  arrange(desc(Count))

# Heat map of those counts; y is named explicitly instead of being passed
# positionally.
ggplot(data = tile.map, aes(x = gender, y = NationalITy, fill = Count)) +
  geom_tile()
# Split the rows into a training set and a hold-out set, then fit and plot a
# classification tree for Class.
set.seed(55)

# sample.split() returns a logical vector (TRUE = training row) built from the
# Class labels with SplitRatio = 0.9.
split <- sample.split(data$Class, SplitRatio = 0.9)

# Turn text columns into factors BEFORE splitting so that both subsets carry
# the same level sets. With plain character columns the levels are derived
# from the training rows only, and predict() stops with a "new levels" error
# as soon as a category value occurs only in the hold-out rows.
model.data <- data
text.cols <- vapply(model.data, is.character, logical(1))
model.data[text.cols] <- lapply(model.data[text.cols], factor)

# Index with the logical vector directly (no comparison against the
# reassignable shorthands T / F).
train <- model.data[split, , drop = FALSE]
cv <- model.data[!split, , drop = FALSE]

# Classification tree on all remaining columns; minbucket = 1 allows leaves
# that hold a single observation.
tree.model <- rpart(Class ~ ., data = train, method = "class", minbucket = 1)
prp(tree.model)
#### Esta matriz indica el número de aciertos y errores de las
predicciones del algoritmo.
# Predict a Class label for every hold-out row, then cross-tabulate the
# actual labels (rows) against the predicted ones (columns).
tree.predict <- predict(object = tree.model, newdata = cv, type = "class")
table(cv$Class, tree.predict)
## tree.predict
## H L M
## H 12 0 2
## L 0 12 1
## M 3 2 16