# Filter: keep only the rows whose topic is one of three topics of interest.
is_correct %>%
  filter(topic %in% c("28452643", "53038110", "173821568"))
## user_id session_id session_no topic sub_topic learning_node
## 1 874390028 9139494762 1 173821568 5665086549 9595128676
## 2 2487306790 9151419864 1 53038110 3335170278 3639410144
## 3 2671732382 8267831852 5 53038110 5200191480 9373062785
## 4 2726714781 6837889277 1 53038110 3335170278 2557121605
## 5 3039027676 6617483178 3 173821568 5665086549 9595128676
## 6 5105146951 6208711696 3 173821568 5665086549 9595128676
## 7 5288067940 3307982526 9 173821568 5665086549 9595128676
## 8 5412865919 72141290 1 173821568 5665086549 9595128676
## 9 5663543357 1144964174 4 28452643 5665086549 2411866361
## 10 6339610957 3865314666 20 28452643 5665086549 6192822276
## 11 6434646443 8570039313 1 53038110 4574090690 3233119321
## 12 6599726005 3196085126 1 173821568 5665086549 9595128676
## 13 7604098932 4153886993 1 53038110 3335170278 4686251457
## 14 8579827264 8088725052 1 28452643 5665086549 5973815917
## 15 9337656425 7331968311 3 28452643 5665086549 5973815917
## question_id question_type session_question_no learning_node_question_no
## 1 9053675445 Single choice 2 2
## 2 8061555455 Single choice 14 2
## 3 2349677010 Single choice 14 2
## 4 8303653933 Single choice 5 1
## 5 8921810238 Single choice 2 2
## 6 9053675445 Single choice 2 2
## 7 9053675445 Single choice 2 2
## 8 9053675445 Single choice 2 2
## 9 2574764738 Single choice 7 1
## 10 1790818228 Single choice 10 2
## 11 1093066137 Single choice 14 2
## 12 8921810238 Single choice 2 2
## 13 3158312426 Single choice 2 2
## 14 4240925782 Single choice 4 2
## 15 4240925782 Single choice 1 1
## question_difficulty question_number_of_choice
## 1 4
## 2 medium 5
## 3 medium 5
## 4 medium 5
## 5 4
## 6 4
## 7 4
## 8 4
## 9 4
## 10 4
## 11 medium 5
## 12 4
## 13 medium 5
## 14 4
## 15 4
## question_number_of_correct_choice row_id is_correct
## 1 1 613 0.86499660
## 2 1 1761 0.45501879
## 3 1 1889 0.48509344
## 4 1 1934 0.48844802
## 5 1 2163 0.59751272
## 6 1 3593 0.14251575
## 7 1 3729 0.70881678
## 8 1 3816 0.90210946
## 9 1 3985 0.02520149
## 10 1 4457 0.18406078
## 11 1 4531 0.34929711
## 12 1 4647 0.21317235
## 13 1 5358 0.69298546
## 14 1 6042 0.33414826
## 15 1 6540 0.62240432
# Filter: restrict the data to the single topic 53038110.
filter(is_correct, topic == "53038110")
## user_id session_id session_no topic sub_topic learning_node
## 1 2487306790 9151419864 1 53038110 3335170278 3639410144
## 2 2671732382 8267831852 5 53038110 5200191480 9373062785
## 3 2726714781 6837889277 1 53038110 3335170278 2557121605
## 4 6434646443 8570039313 1 53038110 4574090690 3233119321
## 5 7604098932 4153886993 1 53038110 3335170278 4686251457
## question_id question_type session_question_no learning_node_question_no
## 1 8061555455 Single choice 14 2
## 2 2349677010 Single choice 14 2
## 3 8303653933 Single choice 5 1
## 4 1093066137 Single choice 14 2
## 5 3158312426 Single choice 2 2
## question_difficulty question_number_of_choice
## 1 medium 5
## 2 medium 5
## 3 medium 5
## 4 medium 5
## 5 medium 5
## question_number_of_correct_choice row_id is_correct
## 1 1 1761 0.4550188
## 2 1 1889 0.4850934
## 3 1 1934 0.4884480
## 4 1 4531 0.3492971
## 5 1 5358 0.6929855
# Arrange: take the same three-topic subset and order it from the highest
# is_correct value down to the lowest.
is_correct %>%
  filter(topic %in% c("28452643", "53038110", "173821568")) %>%
  arrange(desc(is_correct))
## user_id session_id session_no topic sub_topic learning_node
## 1 5412865919 72141290 1 173821568 5665086549 9595128676
## 2 874390028 9139494762 1 173821568 5665086549 9595128676
## 3 5288067940 3307982526 9 173821568 5665086549 9595128676
## 4 7604098932 4153886993 1 53038110 3335170278 4686251457
## 5 9337656425 7331968311 3 28452643 5665086549 5973815917
## 6 3039027676 6617483178 3 173821568 5665086549 9595128676
## 7 2726714781 6837889277 1 53038110 3335170278 2557121605
## 8 2671732382 8267831852 5 53038110 5200191480 9373062785
## 9 2487306790 9151419864 1 53038110 3335170278 3639410144
## 10 6434646443 8570039313 1 53038110 4574090690 3233119321
## 11 8579827264 8088725052 1 28452643 5665086549 5973815917
## 12 6599726005 3196085126 1 173821568 5665086549 9595128676
## 13 6339610957 3865314666 20 28452643 5665086549 6192822276
## 14 5105146951 6208711696 3 173821568 5665086549 9595128676
## 15 5663543357 1144964174 4 28452643 5665086549 2411866361
## question_id question_type session_question_no learning_node_question_no
## 1 9053675445 Single choice 2 2
## 2 9053675445 Single choice 2 2
## 3 9053675445 Single choice 2 2
## 4 3158312426 Single choice 2 2
## 5 4240925782 Single choice 1 1
## 6 8921810238 Single choice 2 2
## 7 8303653933 Single choice 5 1
## 8 2349677010 Single choice 14 2
## 9 8061555455 Single choice 14 2
## 10 1093066137 Single choice 14 2
## 11 4240925782 Single choice 4 2
## 12 8921810238 Single choice 2 2
## 13 1790818228 Single choice 10 2
## 14 9053675445 Single choice 2 2
## 15 2574764738 Single choice 7 1
## question_difficulty question_number_of_choice
## 1 4
## 2 4
## 3 4
## 4 medium 5
## 5 4
## 6 4
## 7 medium 5
## 8 medium 5
## 9 medium 5
## 10 medium 5
## 11 4
## 12 4
## 13 4
## 14 4
## 15 4
## question_number_of_correct_choice row_id is_correct
## 1 1 3816 0.90210946
## 2 1 613 0.86499660
## 3 1 3729 0.70881678
## 4 1 5358 0.69298546
## 5 1 6540 0.62240432
## 6 1 2163 0.59751272
## 7 1 1934 0.48844802
## 8 1 1889 0.48509344
## 9 1 1761 0.45501879
## 10 1 4531 0.34929711
## 11 1 6042 0.33414826
## 12 1 4647 0.21317235
## 13 1 4457 0.18406078
## 14 1 3593 0.14251575
## 15 1 3985 0.02520149
# Summarise: descriptive statistics of the is_correct column over the whole
# data frame (a single output row); `total` is the number of rows.
is_correct %>%
  summarise(
    Avg_is_correct    = mean(is_correct),
    sd_is_correct     = sd(is_correct),
    max_is_correct    = max(is_correct),
    min_is_correct    = min(is_correct),
    sum_is_correct    = sum(is_correct),
    median_is_correct = median(is_correct),
    total             = n()
  )
## Avg_is_correct sd_is_correct max_is_correct min_is_correct sum_is_correct
## 1 0.5018979 0.2890873 0.9998332 9.11362e-05 3513.285
## median_is_correct total
## 1 0.4965026 7000
# Group by: descriptive statistics of is_correct per topic, ordered from the
# highest mean to the lowest.
#
# The grouping key is `topic`, not `is_correct`: is_correct is a continuous
# value, so grouping on it produced one single-row group per observation
# (7,000 groups, sd always NA, every statistic equal to the value itself).
# Topics that occur only once will still report NA for sd_is_correct.
is_correct %>%
  group_by(topic) %>%
  summarise(
    Avg_is_correct    = mean(is_correct),
    sd_is_correct     = sd(is_correct),
    max_is_correct    = max(is_correct),
    min_is_correct    = min(is_correct),
    sum_is_correct    = sum(is_correct),
    median_is_correct = median(is_correct),
    total             = n(),
    .groups = "drop"  # return an ungrouped tibble without the summarise() message
  ) %>%
  arrange(desc(Avg_is_correct))
# NOTE(review): the console output previously recorded here belonged to the
# per-observation grouping and was removed; re-run to regenerate it.
# Visualization
# Histogram: distribution of is_correct for topics 28452643 and 173821568,
# one panel per topic.
#
# The plotted variable is is_correct and the panels/fill are the topic. The
# previous version binned the topic id itself and faceted on the continuous
# is_correct value, which creates one panel per distinct value.
is_correct %>%
  filter(topic %in% c("28452643", "173821568")) %>%
  mutate(topic = factor(topic)) %>%  # discrete fill/facet key instead of a numeric id
  ggplot(aes(x = is_correct, fill = topic)) +
  geom_histogram(alpha = 0.8, color = "darkblue") +
  ggtitle("Topics28452643And173821568 is_correct") +
  facet_wrap(~topic)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Density: smoothed distribution of is_correct for topics 28452643 and
# 173821568, overlaid and filled by topic.
#
# The density is taken over is_correct (the measured value) with the topic as
# the discrete fill; the previous version estimated a density over the topic
# id and mapped the continuous is_correct value to fill.
is_correct %>%
  filter(topic %in% c("28452643", "173821568")) %>%
  mutate(topic = factor(topic)) %>%  # discrete fill key instead of a numeric id
  ggplot(aes(x = is_correct, fill = topic)) +
  geom_density(alpha = 0.5, color = "darkblue") +
  ggtitle("Topics28452643And173821568 is_correct Density")

# Scatter: is_correct against the position of the question within the session
# for topics 28452643 and 173821568, one panel per topic, point size showing
# the number of answer choices.
#
# Changes from the previous version:
#   * panels are split by topic, not by the continuous is_correct value
#     (which gave one panel per distinct value);
#   * the constant point colour was dropped so the mapped colour is visible;
#   * `se` is passed as a logical;
#   * size is mapped on the points only, so it does not reach the smoother.
# NOTE(review): the choice of x/y axes is an assumption about the intended
# plot -- confirm.
is_correct %>%
  filter(topic %in% c("28452643", "173821568")) %>%
  mutate(topic = factor(topic)) %>%
  ggplot(aes(x = session_question_no, y = is_correct, col = topic)) +
  geom_point(aes(size = question_number_of_choice), alpha = 0.5) +
  geom_smooth(se = FALSE) +
  facet_wrap(~topic)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Box plot: session_question_no, one box per learning_node_question_no, for
# the rows whose is_correct value exceeds 0.7.
#
# The threshold is applied to the is_correct value directly. The previous
# `group_by(is_correct) %>% filter(n() > 0.7)` compared the group *size* with
# 0.7; a group always has at least one row, so nothing was ever filtered out.
# NOTE(review): x = topic and col = is_correct are kept from the original
# mapping; both are numeric here -- confirm they render as intended.
is_correct %>%
  filter(is_correct > 0.7) %>%
  ggplot(aes(
    group = learning_node_question_no,
    x = topic,
    y = session_question_no,
    col = is_correct
  )) +
  geom_boxplot()

# Read data file
# Print the working directory; the relative path below is resolved against it.
getwd()
## [1] "D:/LOWONGAN 2022/RUANG GURU 2022/ruangguru/eda_ruangguru"
# Load train.csv (first line holds the column names) into `train`, then print
# the structure of the resulting data frame.
train <- read.csv(file = "train.csv", header = TRUE)
str(train)
## 'data.frame': 286886 obs. of 18 variables:
## $ user_id : num 2348875 2348875 2348875 2766044 2766044 ...
## $ session_id : num 5.47e+09 5.47e+09 3.21e+09 9.61e+09 7.26e+09 ...
## $ session_no : num 1 1 2 1 2 2 2 2 2 2 ...
## $ topic : num 1.06e+09 1.06e+09 1.06e+09 4.79e+09 4.79e+09 ...
## $ sub_topic : num 6.16e+09 6.16e+09 6.16e+09 6.47e+09 2.16e+09 ...
## $ learning_node : num 1.68e+09 1.68e+09 1.68e+09 3.79e+09 7.86e+09 ...
## $ question_id : num 2.23e+09 2.23e+09 5.59e+09 1.24e+09 4.73e+09 ...
## $ question_type : chr "Single choice" "Single choice" "Single choice" "Single choice" ...
## $ session_question_no : int 1 2 1 1 1 2 3 4 5 6 ...
## $ learning_node_question_no : int 1 2 1 1 1 1 1 1 1 1 ...
## $ question_difficulty : chr "medium" "medium" "medium" "medium" ...
## $ question_number_of_choice : int 4 4 4 4 4 4 4 4 4 4 ...
## $ question_number_of_correct_choice : int 1 1 1 1 1 1 1 1 1 1 ...
## $ question_number_of_correct_selected: int 0 0 1 1 1 0 1 1 1 0 ...
## $ question_number_of_wrong_selected : int 1 1 0 0 0 1 0 0 0 1 ...
## $ ms_first_response : num 18 12 11 47 7 88 22 8 19 28 ...
## $ is_correct : num 0 0 1 1 1 0 1 1 1 0 ...
## $ row_id : int 0 1 2 3 4 5 6 7 8 9 ...
# Logistic regression model
# Fit is_correct against every other column of `train` with nnet::multinom.
# NOTE(review): the str() preview of `train` shows
# question_number_of_correct_selected taking the same values as is_correct
# (and question_number_of_wrong_selected the opposite); keeping them as
# predictors may leak the outcome -- confirm before relying on this model.
library(nnet)
## Warning: package 'nnet' was built under R version 4.0.5
mymodel <- multinom(formula = is_correct ~ ., data = train)
## # weights: 23 (22 variable)
## initial value 198854.222043
## iter 10 value 186352.071955
## iter 20 value 186340.893043
## final value 186340.885090
## converged
# Misclassification rate, measured on the training data itself.
# Confusion table: predicted class `p` in rows, observed is_correct in columns.
p <- predict(mymodel, newdata = train)
tab <- table(p, train$is_correct)
tab
##
## p 0 1
## 0 0 0
## 1 102057 184829
# One minus the share of observations on the diagonal of the table.
1 - sum(diag(tab)) / sum(tab)
## [1] 0.3557406
# Observed class counts, for comparison with the confusion table.
table(train$is_correct)
##
## 0 1
## 102057 184829
# Model performance evaluation
# Accuracy of the fitted model as a function of the probability cutoff,
# computed with ROCR on the training data.
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.0.5
pred <- predict(mymodel, train, type = "prob")
pred <- prediction(pred, train$is_correct)
eval <- performance(pred, "acc")

# Identify best values: index of the highest accuracy on the curve, then the
# accuracy and the cutoff found at that index.
max <- which.max(slot(eval, "y.values")[[1]])
acc <- slot(eval, "y.values")[[1]][max]
cut <- slot(eval, "x.values")[[1]][max]

# Plot the accuracy curve and mark the optimum computed above. The reference
# lines previously sat at fixed values (h = 0.71, v = 0.45) that did not
# correspond to this model's best accuracy/cutoff.
plot(eval)
abline(h = acc, v = cut)

print(c(Accuracy = acc, cutoff = cut))
## Accuracy cutoff.206991
## 0.6444372 0.5328187
# Misclassification rate, measured on the training data itself.
# NOTE(review): this repeats the misclassification section that appears
# earlier in the script, statement for statement.
# Confusion table: predicted class `p` in rows, observed is_correct in columns.
p <- predict(mymodel, newdata = train)
tab <- table(p, train$is_correct)
tab
##
## p 0 1
## 0 0 0
## 1 102057 184829
# One minus the share of observations on the diagonal of the table.
1 - sum(diag(tab)) / sum(tab)
## [1] 0.3557406
# Observed class counts, for comparison with the confusion table.
table(train$is_correct)
##
## 0 1
## 102057 184829
# Model performance evaluation
# Accuracy of the fitted model as a function of the probability cutoff,
# computed with ROCR on the training data.
# NOTE(review): this repeats the evaluation section that appears earlier in
# the script.
library(ROCR)
pred <- predict(mymodel, train, type = "prob")
pred <- prediction(pred, train$is_correct)
eval <- performance(pred, "acc")

# Identify best values: index of the highest accuracy on the curve, then the
# accuracy and the cutoff found at that index.
max <- which.max(slot(eval, "y.values")[[1]])
acc <- slot(eval, "y.values")[[1]][max]
cut <- slot(eval, "x.values")[[1]][max]

# Plot the accuracy curve and mark the optimum computed above. The reference
# lines previously sat at fixed values (h = 0.71, v = 0.45) that did not
# correspond to this model's best accuracy/cutoff.
plot(eval)
abline(h = acc, v = cut)

print(c(Accuracy = acc, cutoff = cut))
## Accuracy cutoff.206991
## 0.6444372 0.5328187