#Load libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(class)
library(rpart)
library(rpart.plot)
library(e1071)
library(caret)
## Loading required package: lattice
library(caTools)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'party'
## The following object is masked from 'package:dplyr':
##
## where
#Input===================================
# Read the xAPI-Edu-Data CSV into `data`, then print its column structure.
data <- read.csv(
  file = "C:/Users/Lejac/OneDrive/Escritorio/ANALISIS ESTADISTICO/PROYECTO #1/xAPI-Edu-Data.csv"
)
str(object = data)
## 'data.frame': 480 obs. of 17 variables:
## $ gender : chr "M" "M" "M" "M" ...
## $ NationalITy : chr "KW" "KW" "KW" "KW" ...
## $ PlaceofBirth : chr "KuwaIT" "KuwaIT" "KuwaIT" "KuwaIT" ...
## $ StageID : chr "lowerlevel" "lowerlevel" "lowerlevel" "lowerlevel" ...
## $ GradeID : chr "G-04" "G-04" "G-04" "G-04" ...
## $ SectionID : chr "A" "A" "A" "A" ...
## $ Topic : chr "IT" "IT" "IT" "IT" ...
## $ Semester : chr "F" "F" "F" "F" ...
## $ Relation : chr "Father" "Father" "Father" "Father" ...
## $ raisedhands : int 15 20 10 30 40 42 35 50 12 70 ...
## $ VisITedResources : int 16 20 7 25 50 30 12 10 21 80 ...
## $ AnnouncementsView : int 2 3 0 5 12 13 0 15 16 25 ...
## $ Discussion : int 20 25 30 35 50 70 17 22 50 70 ...
## $ ParentAnsweringSurvey : chr "Yes" "Yes" "No" "No" ...
## $ ParentschoolSatisfaction: chr "Good" "Good" "Bad" "Bad" ...
## $ StudentAbsenceDays : chr "Under-7" "Under-7" "Above-7" "Above-7" ...
## $ Class : chr "M" "M" "L" "L" ...
# Print a per-column summary of `data`.
summary(object = data)
## gender NationalITy PlaceofBirth StageID
## Length:480 Length:480 Length:480 Length:480
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## GradeID SectionID Topic Semester
## Length:480 Length:480 Length:480 Length:480
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Relation raisedhands VisITedResources AnnouncementsView
## Length:480 Min. : 0.00 Min. : 0.0 Min. : 0.00
## Class :character 1st Qu.: 15.75 1st Qu.:20.0 1st Qu.:14.00
## Mode :character Median : 50.00 Median :65.0 Median :33.00
## Mean : 46.77 Mean :54.8 Mean :37.92
## 3rd Qu.: 75.00 3rd Qu.:84.0 3rd Qu.:58.00
## Max. :100.00 Max. :99.0 Max. :98.00
## Discussion ParentAnsweringSurvey ParentschoolSatisfaction
## Min. : 1.00 Length:480 Length:480
## 1st Qu.:20.00 Class :character Class :character
## Median :39.00 Mode :character Mode :character
## Mean :43.28
## 3rd Qu.:70.00
## Max. :99.00
## StudentAbsenceDays Class
## Length:480 Length:480
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Histograms of the four activity counters.
# An explicit bin width of 5, with bin edges anchored at 0, lines the bars up
# with the axis breaks and replaces the `stat_bin()` default of 30 bins.
ggplot(data = data, aes(x = raisedhands)) +
  geom_histogram(binwidth = 5, boundary = 0, color = "black") +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(x = "Raised Hands", y = "Student Count")

ggplot(data = data, aes(x = VisITedResources)) +
  geom_histogram(binwidth = 5, boundary = 0, color = "black") +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(x = "Visited Resources", y = "Student Count")

ggplot(data = data, aes(x = AnnouncementsView)) +
  geom_histogram(binwidth = 5, boundary = 0, color = "black") +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(x = "Announcements View", y = "Student Count")

ggplot(data = data, aes(x = Discussion)) +
  geom_histogram(binwidth = 5, boundary = 0, color = "black") +
  scale_x_continuous(breaks = seq(0, 100, 5)) +
  labs(x = "Discussion Participation", y = "Student Count")
# Horizontal bar chart: number of students per gender.
ggplot(data, aes(gender)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(0, 300, by = 30)) +
  coord_flip() +
  labs(x = "Gender", y = "Student Count")

# Horizontal bar chart: number of students per nationality.
ggplot(data, aes(NationalITy)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(0, 200, by = 20)) +
  coord_flip() +
  labs(x = "Nationality", y = "Student Count")
Jordan and KW have the highest number of students.
# Students per birth place, coloured by nationality.
# Observation: USA is a mix of nationalities.
ggplot(data, aes(PlaceofBirth, fill = NationalITy)) +
  geom_bar() +
  coord_flip() +
  labs(x = "Birth Place", y = "Student Count")

# Students per grade, coloured by performance class.
# Observation: G-06 has students with only low grades.
ggplot(data, aes(GradeID, fill = Class)) +
  geom_bar() +
  coord_flip() +
  labs(x = "Grade ID", y = "Student Count")

# Students per grade, coloured by gender.
# Observation: G-10 has no females.
ggplot(data, aes(GradeID, fill = gender)) +
  geom_bar() +
  coord_flip() +
  labs(x = "Grade ID", y = "Student Count")

# Students per section, coloured by topic.
ggplot(data, aes(SectionID, fill = Topic)) +
  geom_bar() +
  coord_flip() +
  labs(x = "Section ID", y = "Student Count")
Section C has only IT and Science students.
# Students per topic, coloured by gender.
ggplot(data, aes(Topic, fill = gender)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(0, 100, by = 4)) +
  coord_flip() +
  labs(x = "Topic", y = "Student Count")
Spanish has the worst male-to-female ratio, whereas Science, Chemistry, English and French have a good ratio.
# Students per topic, coloured by nationality.
ggplot(data, aes(Topic, fill = NationalITy)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(0, 100, by = 4)) +
  coord_flip() +
  labs(x = "Topic", y = "Student Count")
IT has the most students from KW. Chemistry has the least diversity. French has the most diversity.
# Students per topic, coloured by school stage.
ggplot(data, aes(Topic, fill = StageID)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(0, 100, by = 4)) +
  coord_flip() +
  labs(x = "Topic", y = "Student Count")
Geology, Biology and Chemistry have only middle-school students. French has all lower-level students except one high-school student.
# Students per topic, coloured by section.
ggplot(data, aes(Topic, fill = SectionID)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(0, 100, by = 4)) +
  coord_flip() +
  labs(x = "Topic", y = "Student Count")
Section C has only Spanish and IT students.
# Students per topic, coloured by semester.
ggplot(data, aes(Topic, fill = Semester)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(0, 100, by = 4)) +
  coord_flip() +
  labs(x = "Topic", y = "Student Count")
IT mostly has students who are in the first semester.
# Students per topic, coloured by the responsible parent (Relation).
ggplot(data, aes(Topic, fill = Relation)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(0, 100, by = 4)) +
  coord_flip() +
  labs(x = "Topic", y = "Student Count")
Most French students have their mother as guardian, whereas most IT students have their father as guardian.
# Students per topic, coloured by performance class (absolute counts).
ggplot(data, aes(Topic, fill = Class)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(0, 100, by = 4)) +
  coord_flip() +
  labs(x = "Topic", y = "Student Count")
# Share of each performance class within every topic.
# `position = "fill"` rescales every bar to the 0-1 range, so the axis shows
# proportions: breaks and label are set for that range instead of for counts.
ggplot(data = data, aes(x = Topic, fill = Class)) +
  geom_bar(position = "fill") +
  labs(x = "Topic", y = "Proportion of Students") +
  coord_flip() +
  scale_y_continuous(breaks = seq(0, 1, 0.1))
Geology has no low-class students.
# Number of students per semester.
ggplot(data, aes(Semester)) +
  geom_bar() +
  labs(x = "Semester", y = "Student Count")
# Number of students per responsible parent (Relation), coloured by semester.
# The axis label spelling is corrected from "Gaurdian" to "Guardian".
ggplot(data = data, aes(x = Relation, fill = Semester)) +
  geom_bar() +
  labs(x = "Guardian", y = "Student Count")
# Students grouped by whether the parent answered the survey, coloured by the
# parent's satisfaction with the school.
ggplot(data, aes(ParentAnsweringSurvey, fill = ParentschoolSatisfaction)) +
  labs(x = "Does parent answer survey ?", y = "Student Count") +
  geom_bar()
Most of the parents who aren’t satisfied with the school do not answer the survey.
# Number of students per parent-satisfaction level.
# The axis label spelling is corrected from "satified" to "satisfied".
ggplot(data = data, aes(x = ParentschoolSatisfaction)) +
  geom_bar() +
  labs(x = "Is the parent satisfied with the school ?", y = "Student Count")
# Number of students per absence category.
ggplot(data, aes(StudentAbsenceDays)) +
  geom_bar() +
  labs(
    x = "Is the student absent for more than seven days",
    y = "Student Count"
  )

# Number of students per performance class, coloured by gender.
ggplot(data, aes(Class, fill = gender)) +
  geom_bar() +
  labs(x = "Class", y = "Student Count")
Very few girls are in the low class.
# Number of students per performance class, coloured by responsible parent.
ggplot(data, aes(Class, fill = Relation)) +
  geom_bar() +
  labs(x = "Class", y = "Student Count")
Students who have their mother as guardian have a higher chance of getting high-class marks.
# Number of students per performance class, coloured by whether the parent
# answered the survey.
ggplot(data, aes(Class, fill = ParentAnsweringSurvey)) +
  geom_bar() +
  labs(x = "Class", y = "Student Count")
Students whose parents answer the survey are the ones getting good marks.
# Number of students per performance class, coloured by absence category.
ggplot(data, aes(Class, fill = StudentAbsenceDays)) +
  geom_bar() +
  labs(x = "Class", y = "Student Count")
Students who are absent more often are the ones getting low marks.
# Distribution of raised hands by gender.
ggplot(data, aes(gender, raisedhands)) +
  geom_boxplot()
Girls have more hand raises.
# Distribution of visited resources by gender.
ggplot(data, aes(gender, VisITedResources)) +
  geom_boxplot()
Girls visit more resources.
# Distribution of raised hands by nationality.
ggplot(data, aes(NationalITy, raisedhands)) +
  geom_boxplot()
Jordan has more hand raises than KW. Libya has the lowest. Iraq and Palestine have the highest hand raises.
# Distribution of raised hands by school stage.
ggplot(data, aes(StageID, raisedhands)) +
  geom_boxplot()
More hand raises in middle schools.
# Distribution of discussion participation by school stage.
ggplot(data, aes(StageID, Discussion)) +
  geom_boxplot()
More discussions in high schools.
# Distribution of raised hands by grade.
ggplot(data, aes(GradeID, raisedhands)) +
  geom_boxplot()
Grade 6 has the highest number of hand raises on average.
# Distribution of discussion participation by section.
ggplot(data, aes(SectionID, Discussion)) +
  geom_boxplot()
Section C has the lowest discussion participation.
# Distribution of raised hands by topic.
ggplot(data, aes(Topic, raisedhands)) +
  geom_boxplot()
Interestingly, IT has very few hand raises although most students study it.
# Distribution of raised hands by semester.
ggplot(data, aes(Semester, raisedhands)) +
  geom_boxplot()
Second semester -> more hand raises
# Distribution of raised hands by responsible parent.
ggplot(data, aes(Relation, raisedhands)) +
  geom_boxplot()
Guardian is the mother -> more hand raises
# Distribution of raised hands by whether the parent answered the survey.
ggplot(data, aes(ParentAnsweringSurvey, raisedhands)) +
  geom_boxplot()
Survey answer: yes -> more hand raises
# Remaining activity counters by whether the parent answered the survey.
ggplot(data, aes(ParentAnsweringSurvey, VisITedResources)) +
  geom_boxplot()
ggplot(data, aes(ParentAnsweringSurvey, AnnouncementsView)) +
  geom_boxplot()
ggplot(data, aes(ParentAnsweringSurvey, Discussion)) +
  geom_boxplot()

# Distribution of raised hands by parent satisfaction with the school.
ggplot(data, aes(ParentschoolSatisfaction, raisedhands)) +
  geom_boxplot()
Satisfaction: good -> more hand raises
# Remaining activity counters by parent satisfaction with the school.
ggplot(data, aes(ParentschoolSatisfaction, VisITedResources)) +
  geom_boxplot()
ggplot(data, aes(ParentschoolSatisfaction, AnnouncementsView)) +
  geom_boxplot()
ggplot(data, aes(ParentschoolSatisfaction, Discussion)) +
  geom_boxplot()

# Distribution of raised hands by absence category.
ggplot(data, aes(StudentAbsenceDays, raisedhands)) +
  geom_boxplot()
More absences, fewer hand raises.
# Remaining activity counters by absence category.
ggplot(data, aes(StudentAbsenceDays, VisITedResources)) +
  geom_boxplot()
ggplot(data, aes(StudentAbsenceDays, AnnouncementsView)) +
  geom_boxplot()
ggplot(data, aes(StudentAbsenceDays, Discussion)) +
  geom_boxplot()

# Raised hands by whether the parent answered the survey.
# NOTE(review): this repeats an identical plot drawn earlier in the script.
ggplot(data, aes(ParentAnsweringSurvey, raisedhands)) +
  geom_boxplot()
Survey answer: yes -> more hand raises
# Remaining activity counters by whether the parent answered the survey.
# NOTE(review): these three plots repeat identical calls made earlier in the
# script.
ggplot(data, aes(ParentAnsweringSurvey, VisITedResources)) +
  geom_boxplot()
ggplot(data, aes(ParentAnsweringSurvey, AnnouncementsView)) +
  geom_boxplot()
ggplot(data, aes(ParentAnsweringSurvey, Discussion)) +
  geom_boxplot()

## Class-wise boxplots
# Distribution of raised hands by performance class.
ggplot(data, aes(Class, raisedhands)) +
  geom_boxplot()
High marks -> more hand raises
# Distribution of visited resources by performance class.
ggplot(data, aes(Class, VisITedResources)) +
  geom_boxplot()
High marks -> more visited resources
# Distribution of announcement views by performance class.
ggplot(data, aes(Class, AnnouncementsView)) +
  geom_boxplot()
Higher marks -> more announcement views
# Distribution of discussion participation by performance class.
ggplot(data, aes(Class, Discussion)) +
  geom_boxplot()
High marks -> more discussion
# Pairwise scatter plots of the four activity counters, each with a linear
# regression line. The smoothing formula is passed explicitly so that
# `geom_smooth()` does not have to pick (and announce) its default.
ggplot(data = data, aes(x = raisedhands, y = VisITedResources)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

ggplot(data = data, aes(x = raisedhands, y = AnnouncementsView)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

ggplot(data = data, aes(x = raisedhands, y = Discussion)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

ggplot(data = data, aes(x = VisITedResources, y = AnnouncementsView)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

ggplot(data = data, aes(x = VisITedResources, y = Discussion)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

ggplot(data = data, aes(x = AnnouncementsView, y = Discussion)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)
# Density of raised hands, one curve per level of each grouping variable.
ggplot(data, aes(raisedhands, color = gender)) +
  geom_density()
ggplot(data, aes(raisedhands, color = Topic)) +
  geom_density()
ggplot(data, aes(raisedhands, color = SectionID)) +
  geom_density()
ggplot(data, aes(raisedhands, color = Semester)) +
  geom_density()
ggplot(data, aes(raisedhands, color = Class)) +
  geom_density()
# Count students for every gender x nationality combination, largest first.
# `.groups = "drop"` returns an ungrouped table, so no grouping leaks into
# later steps and `summarise()` no longer reports the leftover grouping.
tile.map <- data %>%
  group_by(gender, NationalITy) %>%
  summarise(Count = n(), .groups = "drop") %>%
  arrange(desc(Count))

# Heat map of those counts; the y aesthetic is now named explicitly.
ggplot(data = tile.map, aes(x = gender, y = NationalITy, fill = Count)) +
  geom_tile()
# Fix the RNG seed so the train / validation split is reproducible.
set.seed(55)

# Logical vector marking the training rows: a 75% sample drawn with respect to
# the `Class` labels.
split <- sample.split(data$Class, SplitRatio = 0.75)

# Subset on the logical vector directly instead of comparing against `T` / `F`,
# which are ordinary variables that can be reassigned.
train <- subset(data, split)
cv <- subset(data, !split)

# Classification tree predicting `Class` from all other columns, then plot it.
tree.model <- rpart(Class ~ ., data = train, method = "class", minbucket = 1)
prp(tree.model)