#Load libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(class)
library(rpart)
library(rpart.plot)
library(e1071)
library(caret)
## Loading required package: lattice
library(caTools)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## 
## Attaching package: 'party'
## The following object is masked from 'package:dplyr':
## 
##     where
#Input===================================
data <- read.csv("C:/Users/Lejac/OneDrive/Escritorio/ANALISIS ESTADISTICO/PROYECTO #1/xAPI-Edu-Data.csv")
str(data)
## 'data.frame':    480 obs. of  17 variables:
##  $ gender                  : chr  "M" "M" "M" "M" ...
##  $ NationalITy             : chr  "KW" "KW" "KW" "KW" ...
##  $ PlaceofBirth            : chr  "KuwaIT" "KuwaIT" "KuwaIT" "KuwaIT" ...
##  $ StageID                 : chr  "lowerlevel" "lowerlevel" "lowerlevel" "lowerlevel" ...
##  $ GradeID                 : chr  "G-04" "G-04" "G-04" "G-04" ...
##  $ SectionID               : chr  "A" "A" "A" "A" ...
##  $ Topic                   : chr  "IT" "IT" "IT" "IT" ...
##  $ Semester                : chr  "F" "F" "F" "F" ...
##  $ Relation                : chr  "Father" "Father" "Father" "Father" ...
##  $ raisedhands             : int  15 20 10 30 40 42 35 50 12 70 ...
##  $ VisITedResources        : int  16 20 7 25 50 30 12 10 21 80 ...
##  $ AnnouncementsView       : int  2 3 0 5 12 13 0 15 16 25 ...
##  $ Discussion              : int  20 25 30 35 50 70 17 22 50 70 ...
##  $ ParentAnsweringSurvey   : chr  "Yes" "Yes" "No" "No" ...
##  $ ParentschoolSatisfaction: chr  "Good" "Good" "Bad" "Bad" ...
##  $ StudentAbsenceDays      : chr  "Under-7" "Under-7" "Above-7" "Above-7" ...
##  $ Class                   : chr  "M" "M" "L" "L" ...
summary (data)
##     gender          NationalITy        PlaceofBirth         StageID         
##  Length:480         Length:480         Length:480         Length:480        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    GradeID           SectionID            Topic             Semester        
##  Length:480         Length:480         Length:480         Length:480        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Relation          raisedhands     VisITedResources AnnouncementsView
##  Length:480         Min.   :  0.00   Min.   : 0.0     Min.   : 0.00    
##  Class :character   1st Qu.: 15.75   1st Qu.:20.0     1st Qu.:14.00    
##  Mode  :character   Median : 50.00   Median :65.0     Median :33.00    
##                     Mean   : 46.77   Mean   :54.8     Mean   :37.92    
##                     3rd Qu.: 75.00   3rd Qu.:84.0     3rd Qu.:58.00    
##                     Max.   :100.00   Max.   :99.0     Max.   :98.00    
##    Discussion    ParentAnsweringSurvey ParentschoolSatisfaction
##  Min.   : 1.00   Length:480            Length:480              
##  1st Qu.:20.00   Class :character      Class :character        
##  Median :39.00   Mode  :character      Mode  :character        
##  Mean   :43.28                                                 
##  3rd Qu.:70.00                                                 
##  Max.   :99.00                                                 
##  StudentAbsenceDays    Class          
##  Length:480         Length:480        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

Exploratory Data Analysis

Histograms

ggplot(data = data, aes(x = raisedhands)) + geom_histogram(color = "black") +
    scale_x_continuous(breaks = seq(0,100,5)) + 
    labs(x = "Raised Hands", y = "Student Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = data, aes(x = VisITedResources)) + geom_histogram(color = "black") +
    scale_x_continuous(breaks = seq(0,100,5)) + 
    labs(x = "Visited Resources", y = "Student Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = data, aes(x = AnnouncementsView)) + geom_histogram(color = "black") +
    scale_x_continuous(breaks = seq(0,100,5)) + 
    labs(x = "Announcements View", y = "Student Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = data, aes(x = Discussion)) + geom_histogram(color = "black") +
    scale_x_continuous(breaks = seq(0,100,5)) + 
    labs(x = "Discussion Participation", y = "Student Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Barplots

ggplot(data = data, aes(x = gender)) + geom_bar() + 
    labs(x = "Gender", y = "Student Count") +
    scale_y_continuous(breaks = seq(0,300,30)) + coord_flip()

ggplot(data = data, aes(x = NationalITy)) + geom_bar() + 
    labs(x = "Nationality", y = "Student Count") +
    scale_y_continuous(breaks = seq(0,200,20)) + coord_flip()

Jordan and KW has highest number of students.

ggplot(data = data, aes(x = PlaceofBirth)) + geom_bar(aes(fill = NationalITy)) + 
    labs(x = "Birth Place", y = "Student Count") + coord_flip() # usa is a mix of nationalities

ggplot(data = data, aes(x = GradeID, fill = Class)) + geom_bar() + 
    labs(x = "Grade ID", y = "Student Count") + coord_flip() # g-06 has students with only low grades

ggplot(data = data, aes(x = GradeID, fill = gender)) + geom_bar() + 
    labs(x = "Grade ID", y = "Student Count") + coord_flip() # g-10 has no females

ggplot(data = data, aes(x = SectionID, fill = Topic)) + geom_bar() +
    labs(x = "Section ID", y = "Student Count") +
    coord_flip()

Class c has only IT and science students

ggplot(data = data, aes(x = Topic, fill = gender)) + geom_bar() +
    labs(x = "Topic", y = "Student Count") +
    scale_y_continuous(breaks = seq(0,100,4)) + coord_flip()

Spanish has worst male:female ratio whereas science, chemistry, english and french have good ratio.

ggplot(data = data, aes(x = Topic, fill = NationalITy)) + geom_bar() +
    labs(x = "Topic", y = "Student Count") + coord_flip() +
    scale_y_continuous(breaks = seq(0,100,4))

IT has most people from KW. Chemistry has least diversity. French has most diversity.

ggplot(data = data, aes(x = Topic, fill = StageID)) + geom_bar() +
    labs(x = "Topic", y = "Student Count") + coord_flip() +
    scale_y_continuous(breaks = seq(0,100,4))

Geology, biology and chemistry only middle school. French has all low level except one high level student.

ggplot(data = data, aes(x = Topic, fill = SectionID)) + geom_bar() +
    labs(x = "Topic", y = "Student Count") + coord_flip() +
    scale_y_continuous(breaks = seq(0,100,4))

Section C only has spanish and IT students.

ggplot(data = data, aes(x = Topic, fill = Semester)) + geom_bar() +
    labs(x = "Topic", y = "Student Count") + coord_flip() +
    scale_y_continuous(breaks = seq(0,100,4))

IT mostly has students who are in first semester.

ggplot(data = data, aes(x = Topic, fill = Relation)) + geom_bar() +
    labs(x = "Topic", y = "Student Count") + coord_flip() +
    scale_y_continuous(breaks = seq(0,100,4))

Most french students have mom as gaurdian whereas most IT students have fathers as gaurdian.

ggplot(data = data, aes(x = Topic, fill = Class)) + geom_bar() +
    labs(x = "Topic", y = "Student Count") + coord_flip() +
    scale_y_continuous(breaks = seq(0,100,4))

ggplot(data = data, aes(x = Topic, fill = Class)) + geom_bar(position = "fill") +
    labs(x = "Topic", y = "Student Count") + coord_flip() +
    scale_y_continuous(breaks = seq(0,100,4))

Geology has no low class students.

ggplot(data = data, aes(x = Semester)) + geom_bar() + 
    labs(x = "Semester", y = "Student Count")

ggplot(data = data, aes(x = Relation, fill = Semester)) + geom_bar() +
    labs(x = "Gaurdian", y = "Student Count")

ggplot(data = data, aes(x = ParentAnsweringSurvey, fill = ParentschoolSatisfaction)) +
    geom_bar() + 
    labs(x = "Does parent answer survey ?", y = "Student Count")

Most of the parents who aren’t satisfied with the school do not answer the survey.

ggplot(data = data, aes(x = ParentschoolSatisfaction)) +
    geom_bar() + 
    labs(x = "Is the parent satified with the school ?", y = "Student Count")

ggplot(data = data, aes(x = StudentAbsenceDays)) + geom_bar() + 
            labs(x = "Is the student absent for more than seven days", y = "Student Count")

ggplot(data = data, aes(x = Class, fill = gender)) + geom_bar() +
    labs(x = "Class", y = "Student Count")

Very few girls in low class.

ggplot(data = data, aes(x = Class, fill = Relation)) + geom_bar() +
    labs(x = "Class", y = "Student Count")

The students who have moms as gaurdians have higher chances to get high class marks.

ggplot(data = data, aes(x = Class, fill = ParentAnsweringSurvey)) + geom_bar() +
    labs(x = "Class", y = "Student Count")

Students whose parents answer the survey are the ones getting good marks.

ggplot(data = data, aes(x = Class, fill = StudentAbsenceDays)) + geom_bar() +
    labs(x = "Class", y = "Student Count")

Student getting absent are the ones getting low marks.

Boxplots

ggplot(data = data, aes(x = gender, y = raisedhands)) + geom_boxplot()

Girls have more hand raises.

ggplot(data = data, aes(x = gender, y = VisITedResources)) + geom_boxplot()

Girls visit more resources.

ggplot(data = data, aes(x = NationalITy, y = raisedhands)) + geom_boxplot()

Jordan more hand raises than KW. Lybia lowest. Iraq and Palestine highest hand raises.

ggplot(data = data, aes(x = StageID, y = raisedhands)) + geom_boxplot()

More hand raises in middle schools.

ggplot(data = data, aes(x = StageID, y = Discussion)) + geom_boxplot()

More discussions in high schools.

ggplot(data = data, aes(x = GradeID, y = raisedhands)) + geom_boxplot()

Grade 6 has most number of hand raises on an average.

ggplot(data = data, aes(x = SectionID, y = Discussion)) + geom_boxplot()

Section C with lowest discussion.

ggplot(data = data, aes(x = Topic, y = raisedhands)) + geom_boxplot()

IT has very few hand raises intersetingly although most students study there.

ggplot(data = data, aes(x = Semester, y = raisedhands)) + geom_boxplot()

Second sem –> more hand raises

ggplot(data = data, aes(x = Relation, y = raisedhands)) + geom_boxplot()

Gaurdians moter -> more hand raises

ggplot(data = data, aes(x = ParentAnsweringSurvey, y = raisedhands)) + geom_boxplot()

Survey answer-yes –> more hand raises

ggplot(data = data, aes(x = ParentAnsweringSurvey, y = VisITedResources)) + geom_boxplot()

ggplot(data = data, aes(x = ParentAnsweringSurvey, y = AnnouncementsView)) + geom_boxplot()

ggplot(data = data, aes(x = ParentAnsweringSurvey, y = Discussion)) + geom_boxplot()

ggplot(data = data, aes(x = ParentschoolSatisfaction, y = raisedhands)) + geom_boxplot()

Satisfaction-good –> more hand raises

ggplot(data = data, aes(x = ParentschoolSatisfaction, y = VisITedResources)) + geom_boxplot()

ggplot(data = data, aes(x = ParentschoolSatisfaction, y = AnnouncementsView)) + geom_boxplot()

ggplot(data = data, aes(x = ParentschoolSatisfaction, y = Discussion)) + geom_boxplot()

ggplot(data = data, aes(x = StudentAbsenceDays, y = raisedhands)) + geom_boxplot()

More leaves less hand raises.

ggplot(data = data, aes(x = StudentAbsenceDays, y = VisITedResources)) + geom_boxplot()

ggplot(data = data, aes(x = StudentAbsenceDays, y = AnnouncementsView)) + geom_boxplot()

ggplot(data = data, aes(x = StudentAbsenceDays, y = Discussion)) + geom_boxplot()

ggplot(data = data, aes(x = ParentAnsweringSurvey, y = raisedhands)) + geom_boxplot()

Survy answer- yes –> more hand raises

ggplot(data = data, aes(x = ParentAnsweringSurvey, y = VisITedResources)) + geom_boxplot()

ggplot(data = data, aes(x = ParentAnsweringSurvey, y = AnnouncementsView)) + geom_boxplot()

ggplot(data = data, aes(x = ParentAnsweringSurvey, y = Discussion)) + geom_boxplot()

##Class-wise boxplots

ggplot(data = data, aes(x = Class, y = raisedhands)) + geom_boxplot()

High marks –> high hand raises

ggplot(data = data, aes(x = Class, y = VisITedResources)) + geom_boxplot()

High marks –> visited resources

ggplot(data = data, aes(x = Class, y = AnnouncementsView)) + geom_boxplot()

More marks more announcements views

ggplot(data = data, aes(x = Class, y = Discussion)) + geom_boxplot()

High marks more discussion

Scatterplots

ggplot(data = data, aes( x = raisedhands, y = VisITedResources)) + geom_point() +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(data = data, aes( x = raisedhands, y = AnnouncementsView)) + geom_point() +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(data = data, aes( x = raisedhands, y = Discussion)) + geom_point() +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(data = data, aes( x = VisITedResources, y = AnnouncementsView)) + geom_point() +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(data = data, aes( x = VisITedResources, y = Discussion)) + geom_point() +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(data = data, aes( x = AnnouncementsView, y = Discussion)) + geom_point() +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Density Plots

ggplot(data = data, aes(x = raisedhands, color = gender)) + geom_density()

ggplot(data = data, aes(x = raisedhands, color = Topic)) + geom_density()

ggplot(data = data, aes(x = raisedhands, color = SectionID)) + geom_density()

ggplot(data = data, aes(x = raisedhands, color = Semester)) + geom_density()

ggplot(data = data, aes(x = raisedhands, color = Class)) + geom_density()

Tile map

tile.map <- data %>% group_by(gender, NationalITy) %>%
                        summarise(Count = n()) %>% arrange(desc(Count))
## `summarise()` has grouped output by 'gender'. You can override using the
## `.groups` argument.
ggplot(data = tile.map, aes(x = gender, NationalITy, fill = Count)) + geom_tile()

Predictive modelling

Splitting data into train and cross-validation sets.

set.seed(55)
split <- sample.split(data$Class, SplitRatio = 0.75)
train <- subset(data, split == T)
cv <- subset(data, split == F)

Decision trees

tree.model <- rpart(Class ~ ., data = train, method = "class", minbucket = 1)
prp(tree.model)