eda.utf8

# Filter: keep only the rows that belong to one of three topics of interest
is_correct %>%
  filter(topic %in% c("28452643", "53038110", "173821568"))

##       user_id session_id session_no     topic  sub_topic learning_node
## 1   874390028 9139494762          1 173821568 5665086549    9595128676
## 2  2487306790 9151419864          1  53038110 3335170278    3639410144
## 3  2671732382 8267831852          5  53038110 5200191480    9373062785
## 4  2726714781 6837889277          1  53038110 3335170278    2557121605
## 5  3039027676 6617483178          3 173821568 5665086549    9595128676
## 6  5105146951 6208711696          3 173821568 5665086549    9595128676
## 7  5288067940 3307982526          9 173821568 5665086549    9595128676
## 8  5412865919   72141290          1 173821568 5665086549    9595128676
## 9  5663543357 1144964174          4  28452643 5665086549    2411866361
## 10 6339610957 3865314666         20  28452643 5665086549    6192822276
## 11 6434646443 8570039313          1  53038110 4574090690    3233119321
## 12 6599726005 3196085126          1 173821568 5665086549    9595128676
## 13 7604098932 4153886993          1  53038110 3335170278    4686251457
## 14 8579827264 8088725052          1  28452643 5665086549    5973815917
## 15 9337656425 7331968311          3  28452643 5665086549    5973815917
##    question_id question_type session_question_no learning_node_question_no
## 1   9053675445 Single choice                   2                         2
## 2   8061555455 Single choice                  14                         2
## 3   2349677010 Single choice                  14                         2
## 4   8303653933 Single choice                   5                         1
## 5   8921810238 Single choice                   2                         2
## 6   9053675445 Single choice                   2                         2
## 7   9053675445 Single choice                   2                         2
## 8   9053675445 Single choice                   2                         2
## 9   2574764738 Single choice                   7                         1
## 10  1790818228 Single choice                  10                         2
## 11  1093066137 Single choice                  14                         2
## 12  8921810238 Single choice                   2                         2
## 13  3158312426 Single choice                   2                         2
## 14  4240925782 Single choice                   4                         2
## 15  4240925782 Single choice                   1                         1
##    question_difficulty question_number_of_choice
## 1                                              4
## 2               medium                         5
## 3               medium                         5
## 4               medium                         5
## 5                                              4
## 6                                              4
## 7                                              4
## 8                                              4
## 9                                              4
## 10                                             4
## 11              medium                         5
## 12                                             4
## 13              medium                         5
## 14                                             4
## 15                                             4
##    question_number_of_correct_choice row_id is_correct
## 1                                  1    613 0.86499660
## 2                                  1   1761 0.45501879
## 3                                  1   1889 0.48509344
## 4                                  1   1934 0.48844802
## 5                                  1   2163 0.59751272
## 6                                  1   3593 0.14251575
## 7                                  1   3729 0.70881678
## 8                                  1   3816 0.90210946
## 9                                  1   3985 0.02520149
## 10                                 1   4457 0.18406078
## 11                                 1   4531 0.34929711
## 12                                 1   4647 0.21317235
## 13                                 1   5358 0.69298546
## 14                                 1   6042 0.33414826
## 15                                 1   6540 0.62240432

# Narrow the selection down to a single topic
is_correct %>%
  filter(topic == "53038110")

##      user_id session_id session_no    topic  sub_topic learning_node
## 1 2487306790 9151419864          1 53038110 3335170278    3639410144
## 2 2671732382 8267831852          5 53038110 5200191480    9373062785
## 3 2726714781 6837889277          1 53038110 3335170278    2557121605
## 4 6434646443 8570039313          1 53038110 4574090690    3233119321
## 5 7604098932 4153886993          1 53038110 3335170278    4686251457
##   question_id question_type session_question_no learning_node_question_no
## 1  8061555455 Single choice                  14                         2
## 2  2349677010 Single choice                  14                         2
## 3  8303653933 Single choice                   5                         1
## 4  1093066137 Single choice                  14                         2
## 5  3158312426 Single choice                   2                         2
##   question_difficulty question_number_of_choice
## 1              medium                         5
## 2              medium                         5
## 3              medium                         5
## 4              medium                         5
## 5              medium                         5
##   question_number_of_correct_choice row_id is_correct
## 1                                 1   1761  0.4550188
## 2                                 1   1889  0.4850934
## 3                                 1   1934  0.4884480
## 4                                 1   4531  0.3492971
## 5                                 1   5358  0.6929855

# Arrange: the same three-topic subset, sorted from the highest to the lowest
# is_correct value
is_correct %>%
  filter(topic %in% c("28452643", "53038110", "173821568")) %>%
  arrange(desc(is_correct))

##       user_id session_id session_no     topic  sub_topic learning_node
## 1  5412865919   72141290          1 173821568 5665086549    9595128676
## 2   874390028 9139494762          1 173821568 5665086549    9595128676
## 3  5288067940 3307982526          9 173821568 5665086549    9595128676
## 4  7604098932 4153886993          1  53038110 3335170278    4686251457
## 5  9337656425 7331968311          3  28452643 5665086549    5973815917
## 6  3039027676 6617483178          3 173821568 5665086549    9595128676
## 7  2726714781 6837889277          1  53038110 3335170278    2557121605
## 8  2671732382 8267831852          5  53038110 5200191480    9373062785
## 9  2487306790 9151419864          1  53038110 3335170278    3639410144
## 10 6434646443 8570039313          1  53038110 4574090690    3233119321
## 11 8579827264 8088725052          1  28452643 5665086549    5973815917
## 12 6599726005 3196085126          1 173821568 5665086549    9595128676
## 13 6339610957 3865314666         20  28452643 5665086549    6192822276
## 14 5105146951 6208711696          3 173821568 5665086549    9595128676
## 15 5663543357 1144964174          4  28452643 5665086549    2411866361
##    question_id question_type session_question_no learning_node_question_no
## 1   9053675445 Single choice                   2                         2
## 2   9053675445 Single choice                   2                         2
## 3   9053675445 Single choice                   2                         2
## 4   3158312426 Single choice                   2                         2
## 5   4240925782 Single choice                   1                         1
## 6   8921810238 Single choice                   2                         2
## 7   8303653933 Single choice                   5                         1
## 8   2349677010 Single choice                  14                         2
## 9   8061555455 Single choice                  14                         2
## 10  1093066137 Single choice                  14                         2
## 11  4240925782 Single choice                   4                         2
## 12  8921810238 Single choice                   2                         2
## 13  1790818228 Single choice                  10                         2
## 14  9053675445 Single choice                   2                         2
## 15  2574764738 Single choice                   7                         1
##    question_difficulty question_number_of_choice
## 1                                              4
## 2                                              4
## 3                                              4
## 4               medium                         5
## 5                                              4
## 6                                              4
## 7               medium                         5
## 8               medium                         5
## 9               medium                         5
## 10              medium                         5
## 11                                             4
## 12                                             4
## 13                                             4
## 14                                             4
## 15                                             4
##    question_number_of_correct_choice row_id is_correct
## 1                                  1   3816 0.90210946
## 2                                  1    613 0.86499660
## 3                                  1   3729 0.70881678
## 4                                  1   5358 0.69298546
## 5                                  1   6540 0.62240432
## 6                                  1   2163 0.59751272
## 7                                  1   1934 0.48844802
## 8                                  1   1889 0.48509344
## 9                                  1   1761 0.45501879
## 10                                 1   4531 0.34929711
## 11                                 1   6042 0.33414826
## 12                                 1   4647 0.21317235
## 13                                 1   4457 0.18406078
## 14                                 1   3593 0.14251575
## 15                                 1   3985 0.02520149

# Summarise: descriptive statistics of the is_correct column over all rows
is_correct %>%
  summarise(
    Avg_is_correct = mean(is_correct),
    sd_is_correct = sd(is_correct),
    max_is_correct = max(is_correct),
    min_is_correct = min(is_correct),
    sum_is_correct = sum(is_correct),
    median_is_correct = median(is_correct),
    total = n()  # number of rows summarised
  )

##   Avg_is_correct sd_is_correct max_is_correct min_is_correct sum_is_correct
## 1      0.5018979     0.2890873      0.9998332    9.11362e-05       3513.285
##   median_is_correct total
## 1         0.4965026  7000

# Group by: descriptive statistics of is_correct per topic, best topics first.
# The grouping key is the discrete topic id. Grouping on the continuous
# is_correct value itself places every row in its own group, so each group
# statistic just echoes that one value and sd() is NA.
# NOTE: the printed result that follows in this document was produced with the
# old grouping (by is_correct); re-run to refresh it.
is_correct %>%
  group_by(topic) %>%
  summarise(
    Avg_is_correct = mean(is_correct),
    sd_is_correct = sd(is_correct),  # still NA for topics with a single row
    max_is_correct = max(is_correct),
    min_is_correct = min(is_correct),
    sum_is_correct = sum(is_correct),
    median_is_correct = median(is_correct),
    total = n()  # rows per topic
  ) %>%
  arrange(desc(Avg_is_correct))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 7,000 x 8
##    is_correct Avg_is_correct sd_is_correct max_is_correct min_is_correct
##         <dbl>          <dbl>         <dbl>          <dbl>          <dbl>
##  1      1.00           1.00             NA          1.00           1.00 
##  2      1.00           1.00             NA          1.00           1.00 
##  3      0.999          0.999            NA          0.999          0.999
##  4      0.999          0.999            NA          0.999          0.999
##  5      0.999          0.999            NA          0.999          0.999
##  6      0.998          0.998            NA          0.998          0.998
##  7      0.998          0.998            NA          0.998          0.998
##  8      0.998          0.998            NA          0.998          0.998
##  9      0.998          0.998            NA          0.998          0.998
## 10      0.998          0.998            NA          0.998          0.998
## # ... with 6,990 more rows, and 3 more variables: sum_is_correct <dbl>,
## #   median_is_correct <dbl>, total <int>

# Visualization
# Histogram: distribution of is_correct within each of the two selected topics.
# is_correct is the continuous quantity, so it goes on the x axis; topic is the
# discrete one, so it drives the fill colour and the facets. (Faceting on
# is_correct would request one panel per distinct value.)
is_correct %>%
  filter(topic %in% c("28452643", "173821568")) %>%
  ggplot(aes(x = is_correct, fill = factor(topic))) +
  geom_histogram(alpha = 0.8, color = "darkblue") +
  ggtitle("Topics28452643And173821568 is_correct") +
  labs(fill = "topic") +
  facet_wrap(~topic)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Density: smoothed distribution of is_correct, one filled curve per topic.
# The continuous is_correct value is on the x axis and the topic id, turned
# into a factor, separates the curves by fill colour.
is_correct %>%
  filter(topic %in% c("28452643", "173821568")) %>%
  ggplot(aes(x = is_correct, fill = factor(topic))) +
  geom_density(alpha = 0.5, color = "darkblue") +
  ggtitle("Topics28452643And173821568 is_correct Density") +
  labs(fill = "topic")

# Scatter: is_correct against the question's position in the session, one
# panel and one colour per topic, point size showing the number of choices.
# - No constant colour is set in geom_point(), so the mapped colour is used
#   (a fixed colour there overrides the aesthetic mapping).
# - Facets are built from the discrete topic id rather than from the
#   continuous is_correct value.
# NOTE(review): the x/y choice (session_question_no vs is_correct) is the
# reviewer's reading of the intent - confirm.
is_correct %>%
  filter(topic %in% c("28452643", "173821568")) %>%
  ggplot(aes(x = session_question_no, y = is_correct, col = factor(topic))) +
  geom_point(aes(size = question_number_of_choice), alpha = 0.5) +
  geom_smooth(se = FALSE) +
  labs(col = "topic") +
  facet_wrap(~topic)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Box plot of session_question_no, one box per learning_node_question_no,
# restricted to rows whose is_correct value exceeds 0.7.
# The threshold is applied to the is_correct column directly: n() is a row
# count (always >= 1), so testing n() > 0.7 keeps every row and filters
# nothing. Inside filter() the name is_correct refers to the column.
is_correct %>%
  filter(is_correct > 0.7) %>%
  ggplot(aes(
    group = learning_node_question_no,
    x = topic,
    y = session_question_no,
    col = is_correct
  )) +
  geom_boxplot()

# Read data file
# Show the working directory; the relative path below is resolved against it.
getwd()

## [1] "D:/LOWONGAN 2022/RUANG GURU 2022/ruangguru/eda_ruangguru"

# Load the training set (first line holds the column names) and inspect the
# structure of the resulting data frame.
train <- read.csv(file = "train.csv", header = TRUE)
str(train)

## 'data.frame':    286886 obs. of  18 variables:
##  $ user_id                            : num  2348875 2348875 2348875 2766044 2766044 ...
##  $ session_id                         : num  5.47e+09 5.47e+09 3.21e+09 9.61e+09 7.26e+09 ...
##  $ session_no                         : num  1 1 2 1 2 2 2 2 2 2 ...
##  $ topic                              : num  1.06e+09 1.06e+09 1.06e+09 4.79e+09 4.79e+09 ...
##  $ sub_topic                          : num  6.16e+09 6.16e+09 6.16e+09 6.47e+09 2.16e+09 ...
##  $ learning_node                      : num  1.68e+09 1.68e+09 1.68e+09 3.79e+09 7.86e+09 ...
##  $ question_id                        : num  2.23e+09 2.23e+09 5.59e+09 1.24e+09 4.73e+09 ...
##  $ question_type                      : chr  "Single choice" "Single choice" "Single choice" "Single choice" ...
##  $ session_question_no                : int  1 2 1 1 1 2 3 4 5 6 ...
##  $ learning_node_question_no          : int  1 2 1 1 1 1 1 1 1 1 ...
##  $ question_difficulty                : chr  "medium" "medium" "medium" "medium" ...
##  $ question_number_of_choice          : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ question_number_of_correct_choice  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ question_number_of_correct_selected: int  0 0 1 1 1 0 1 1 1 0 ...
##  $ question_number_of_wrong_selected  : int  1 1 0 0 0 1 0 0 0 1 ...
##  $ ms_first_response                  : num  18 12 11 47 7 88 22 8 19 28 ...
##  $ is_correct                         : num  0 0 1 1 1 0 1 1 1 0 ...
##  $ row_id                             : int  0 1 2 3 4 5 6 7 8 9 ...

# Logistic regression model
# Fit is_correct on every other column of train via nnet::multinom().
# NOTE(review): "." pulls in row_id and the question_number_of_*_selected
# columns as predictors; these look like an identifier and outcome-derived
# fields respectively - confirm they are meant to be in the model.
library(nnet)

## Warning: package 'nnet' was built under R version 4.0.5

mymodel <- multinom(formula = is_correct ~ ., data = train)

## # weights:  23 (22 variable)
## initial  value 198854.222043 
## iter  10 value 186352.071955
## iter  20 value 186340.893043
## final  value 186340.885090 
## converged

# Misclassification rate
# Predicted class for every training row, cross-tabulated against the
# observed is_correct value (rows = predicted, columns = observed).
p <- predict(object = mymodel, newdata = train)
tab <- table(p, train$is_correct)
tab

##    
## p        0      1
##   0      0      0
##   1 102057 184829

# Error rate: one minus the share of counts on the table's diagonal.
1 - sum(diag(tab)) / sum(tab)

## [1] 0.3557406

# Class counts of the observed outcome, for comparison with the table above.
table(train$is_correct)

## 
##      0      1 
## 102057 184829

# Model performance evaluation
library(ROCR)

## Warning: package 'ROCR' was built under R version 4.0.5

# Predicted probabilities on the training data, wrapped together with the
# observed labels into a ROCR prediction object.
pred <- predict(mymodel, train, type = "prob")
pred <- prediction(pred, train$is_correct)

# Accuracy as a function of the classification cutoff.
eval <- performance(pred, "acc")

# Identify best values: position of the highest accuracy on the curve, then
# the accuracy and the cutoff found at that position.
max <- which.max(slot(eval, "y.values")[[1]])
acc <- slot(eval, "y.values")[[1]][max]
cut <- slot(eval, "x.values")[[1]][max]

# Draw the curve and mark the optimum with reference lines taken from the
# values computed above, so the guides always match the fitted model instead
# of fixed numbers.
plot(eval)
abline(h = acc, v = cut)

# The cutoff element carries a name of its own, which print() appends to the
# "cutoff" label.
print(c(Accuracy = acc, cutoff = cut))

##      Accuracy cutoff.206991 
##     0.6444372     0.5328187

# Misclassification rate
# NOTE(review): this repeats the identical "Misclassification Rate" steps that
# appear earlier in the file.
# Predicted class for every training row, cross-tabulated against the
# observed is_correct value (rows = predicted, columns = observed).
p <- predict(object = mymodel, newdata = train)
tab <- table(p, train$is_correct)
tab

##    
## p        0      1
##   0      0      0
##   1 102057 184829

# Error rate: one minus the share of counts on the table's diagonal.
1 - sum(diag(tab)) / sum(tab)

## [1] 0.3557406

# Class counts of the observed outcome, for comparison with the table above.
table(train$is_correct)

## 
##      0      1 
## 102057 184829

# Model performance evaluation
# NOTE(review): this repeats the "Model Performance Evaluation" and
# "Identify Best Values" steps that appear earlier in the file.
library(ROCR)

# Predicted probabilities on the training data, wrapped together with the
# observed labels into a ROCR prediction object.
pred <- predict(mymodel, train, type = "prob")
pred <- prediction(pred, train$is_correct)

# Accuracy as a function of the classification cutoff.
eval <- performance(pred, "acc")

# Identify best values: position of the highest accuracy on the curve, then
# the accuracy and the cutoff found at that position.
max <- which.max(slot(eval, "y.values")[[1]])
acc <- slot(eval, "y.values")[[1]][max]
cut <- slot(eval, "x.values")[[1]][max]

# Draw the curve and mark the optimum with reference lines taken from the
# values computed above, so the guides always match the fitted model instead
# of fixed numbers.
plot(eval)
abline(h = acc, v = cut)

# The cutoff element carries a name of its own, which print() appends to the
# "cutoff" label.
print(c(Accuracy = acc, cutoff = cut))

##      Accuracy cutoff.206991 
##     0.6444372     0.5328187