Greetings! Today I will take a look at the Candy Crush dataset that I found online. While part of the code comes from the DataCamp project, I would like to explore the dataset a little further.

Let us load the libraries first. I will go with the tidyverse since it is the mother of all packages: attaching it also attaches the other packages we will need:

1 Loading the Libraries

library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.6
## v tidyr   0.8.1     v stringr 1.3.1
## v readr   1.1.1     v forcats 0.3.0
## -- Conflicts -------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

2 Getting the data

library(readxl)
candy_crush <- read_excel("D:/Working Directory/candy_crush.xlsx")
View(candy_crush) # opens the data in a spreadsheet-style viewer (interactive only)
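
If you do not have the Excel file handy, the same data can be read straight from the CSV copy on GitHub linked in the note at the end. A minimal sketch using readr's read_csv() (column types, in particular dt, may parse slightly differently than from Excel):

# alternative: read the CSV copy of the dataset directly from GitHub
candy_crush <- read_csv("https://raw.githubusercontent.com/joelrudinas03/Level-Difficulty-in-Candy-Crush-Saga-/master/candy_crush.csv")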

3 Examining the data

head(candy_crush)
## # A tibble: 6 x 5
##   player_id             dt                  level num_attempts num_success
##   <chr>                 <dttm>              <dbl>        <dbl>       <dbl>
## 1 6dd5af4c7228fa353d50~ 2014-01-04 00:00:00     4            3           1
## 2 c7ec97c39349ab7e4d39~ 2014-01-01 00:00:00     8            4           1
## 3 c7ec97c39349ab7e4d39~ 2014-01-05 00:00:00    12            6           0
## 4 a32c5e9700ed356dc8dd~ 2014-01-03 00:00:00    11            1           1
## 5 a32c5e9700ed356dc8dd~ 2014-01-07 00:00:00    15            6           0
## 6 b94d403ac4edf639442f~ 2014-01-01 00:00:00     8            8           1
str(candy_crush)
## Classes 'tbl_df', 'tbl' and 'data.frame':    16865 obs. of  5 variables:
##  $ player_id   : chr  "6dd5af4c7228fa353d505767143f5815" "c7ec97c39349ab7e4d39b4f74062ec13" "c7ec97c39349ab7e4d39b4f74062ec13" "a32c5e9700ed356dc8dd5bb3230c5227" ...
##  $ dt          : POSIXct, format: "2014-01-04" "2014-01-01" ...
##  $ level       : num  4 8 12 11 15 8 12 12 4 15 ...
##  $ num_attempts: num  3 4 6 1 6 8 15 18 1 27 ...
##  $ num_success : num  1 1 0 1 0 1 1 0 1 0 ...
summary(candy_crush)
##   player_id               dt                          level       
##  Length:16865       Min.   :2014-01-01 00:00:00   Min.   : 1.000  
##  Class :character   1st Qu.:2014-01-02 00:00:00   1st Qu.: 6.000  
##  Mode  :character   Median :2014-01-04 00:00:00   Median : 9.000  
##                     Mean   :2014-01-04 01:06:15   Mean   : 9.287  
##                     3rd Qu.:2014-01-06 00:00:00   3rd Qu.:14.000  
##                     Max.   :2014-01-07 00:00:00   Max.   :15.000  
##   num_attempts      num_success     
##  Min.   :  0.000   Min.   : 0.0000  
##  1st Qu.:  1.000   1st Qu.: 0.0000  
##  Median :  3.000   Median : 1.0000  
##  Mean   :  5.535   Mean   : 0.6272  
##  3rd Qu.:  7.000   3rd Qu.: 1.0000  
##  Max.   :258.000   Max.   :55.0000
names(candy_crush)
## [1] "player_id"    "dt"           "level"        "num_attempts"
## [5] "num_success"

4 A visual look at the data

candy_crush %>%
  ggplot(aes(level, num_attempts, col = num_success)) + 
  geom_point() + 
  labs(x = "\n Candy Crush levels \n", 
       y = "\n Number of attempts per player \n", 
       title = "\n Scatterplot of attempts per level \n")

5 Analysis with respect to difficulty

5.1 Checking the number of players

print("Number of players")
## [1] "Number of players"
length(unique(candy_crush$player_id))
## [1] 6814
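
As an aside, dplyr's n_distinct() computes the same count in a single call:

n_distinct(candy_crush$player_id) # equivalent to length(unique(...))
## [1] 6814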

5.2 Computing level difficulty

difficulty <- candy_crush %>%
  group_by(level) %>%
  summarise(wins = sum(num_success), attempts = sum(num_attempts)) %>%
  mutate(p_win = wins/attempts,               # probability of winning in a single attempt
         p_loss = (attempts - wins)/attempts) # equivalently, 1 - p_win

print(difficulty)
## # A tibble: 15 x 5
##    level  wins attempts  p_win p_loss
##    <dbl> <dbl>    <dbl>  <dbl>  <dbl>
##  1     1   818     1322 0.619   0.381
##  2     2   666     1285 0.518   0.482
##  3     3   662     1546 0.428   0.572
##  4     4   705     1893 0.372   0.628
##  5     5   634     6937 0.0914  0.909
##  6     6   668     1591 0.420   0.580
##  7     7   614     4526 0.136   0.864
##  8     8   641    15816 0.0405  0.959
##  9     9   670     8241 0.0813  0.919
## 10    10   617     3282 0.188   0.812
## 11    11   603     5575 0.108   0.892
## 12    12   659     6868 0.0960  0.904
## 13    13   686     1327 0.517   0.483
## 14    14   777     2772 0.280   0.720
## 15    15  1157    30374 0.0381  0.962

5.2.1 Plotting difficulty profile

Given that we now have information for all 15 levels, we can measure difficulty as the probability of winning a level in a single attempt: the lower p_win is, the more difficult the level.
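
To make the definition concrete: level 1 was won 818 times in 1322 attempts, so its single-attempt win probability comes out around 0.619, matching the first row of the table.

818 / 1322 # p_win for level 1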

Let us plot this difficulty measure against the level number.

p <- ggplot(difficulty, aes(x = level, y = p_win)) + geom_line()
p <- p + labs(x = "\n Level in Candy Crush \n")
p <- p + labs(y = "\n Percentage Win \n")

print(p)

5.2.2 Standard error of p_win for each level

# binomial standard error: sqrt(p * (1 - p) / n)
difficulty <- difficulty %>%
  mutate(error = sqrt(p_win * (1 - p_win)/attempts))

print(difficulty)
## # A tibble: 15 x 6
##    level  wins attempts  p_win p_loss   error
##    <dbl> <dbl>    <dbl>  <dbl>  <dbl>   <dbl>
##  1     1   818     1322 0.619   0.381 0.0134 
##  2     2   666     1285 0.518   0.482 0.0139 
##  3     3   662     1546 0.428   0.572 0.0126 
##  4     4   705     1893 0.372   0.628 0.0111 
##  5     5   634     6937 0.0914  0.909 0.00346
##  6     6   668     1591 0.420   0.580 0.0124 
##  7     7   614     4526 0.136   0.864 0.00509
##  8     8   641    15816 0.0405  0.959 0.00157
##  9     9   670     8241 0.0813  0.919 0.00301
## 10    10   617     3282 0.188   0.812 0.00682
## 11    11   603     5575 0.108   0.892 0.00416
## 12    12   659     6868 0.0960  0.904 0.00355
## 13    13   686     1327 0.517   0.483 0.0137 
## 14    14   777     2772 0.280   0.720 0.00853
## 15    15  1157    30374 0.0381  0.962 0.00110

5.2.3 Plotting uncertainty

Let us add error bars to show this uncertainty in the plot. We will set the length of each error bar to one standard error, so the upper and lower limits are p_win + error and p_win - error respectively.

p <- ggplot(difficulty, aes(x = level, y = p_win)) + geom_line() + geom_point() + geom_hline(yintercept = 1)
p <- p + geom_errorbar(aes(ymin = p_win - error, ymax = p_win + error))
p <- p + labs(x = "\n Level in Candy Crush \n")
p <- p + labs(y = "\n Percentage of win \n")

print(p)

5.2.4 Final metric

What is the probability of completing the whole episode without losing a single time? Assuming the levels are independent, it is the product of the per-level win probabilities:

a <- prod(difficulty$p_win)

print(a)
## [1] 9.447141e-12
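
A product of fifteen probabilities this small is fine here, but in more extreme settings it can underflow, so it is sometimes safer to work on the log scale. The same quantity computed as a sum of logs:

# same product, computed on the log scale
exp(sum(log(difficulty$p_win)))
## [1] 9.447141e-12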

6 Analysis with respect to losses

This was not part of the DataCamp project, but I will expand on it anyway.

6.1 Plotting loss profile

Conversely, let us plot the loss profile: the probability of losing a level in a single attempt.

q <- ggplot(difficulty, aes(x = level, y = p_loss)) + geom_line()
q <- q + labs(x = "\n Level in Candy Crush \n")
q <- q + labs(y = "\n Percentage of Loss \n")

print(q)

6.2 Determining standard error for losses

# since p_loss = 1 - p_win, this standard error is identical to `error` above
difficulty <- difficulty %>%
    mutate(error_loss = sqrt(p_loss * (1 - p_loss)/attempts))

print(difficulty)
## # A tibble: 15 x 7
##    level  wins attempts  p_win p_loss   error error_loss
##    <dbl> <dbl>    <dbl>  <dbl>  <dbl>   <dbl>      <dbl>
##  1     1   818     1322 0.619   0.381 0.0134     0.0134 
##  2     2   666     1285 0.518   0.482 0.0139     0.0139 
##  3     3   662     1546 0.428   0.572 0.0126     0.0126 
##  4     4   705     1893 0.372   0.628 0.0111     0.0111 
##  5     5   634     6937 0.0914  0.909 0.00346    0.00346
##  6     6   668     1591 0.420   0.580 0.0124     0.0124 
##  7     7   614     4526 0.136   0.864 0.00509    0.00509
##  8     8   641    15816 0.0405  0.959 0.00157    0.00157
##  9     9   670     8241 0.0813  0.919 0.00301    0.00301
## 10    10   617     3282 0.188   0.812 0.00682    0.00682
## 11    11   603     5575 0.108   0.892 0.00416    0.00416
## 12    12   659     6868 0.0960  0.904 0.00355    0.00355
## 13    13   686     1327 0.517   0.483 0.0137     0.0137 
## 14    14   777     2772 0.280   0.720 0.00853    0.00853
## 15    15  1157    30374 0.0381  0.962 0.00110    0.00110

6.3 Plotting uncertainty

q <- ggplot(difficulty, aes(x = level, y = p_loss)) + geom_line() + geom_point() + geom_hline(yintercept = 1)
q <- q + geom_errorbar(aes(ymin = p_loss - error_loss, ymax = p_loss + error_loss))
q <- q + labs(x = "\n Level in Candy Crush \n")
q <- q + labs(y = "\n Percentage of Loss \n")

print(q)

6.4 Probability of defeat

Conversely, what is the probability of losing every single level of the episode in one attempt each? Again assuming independence, it is the product of the per-level loss probabilities:

b <- prod(difficulty$p_loss)

print(b)
## [1] 0.005794269

7 Applying machine learning concepts to the data

7.0.1 Filtering the data

I am going to drop player_id and dt so that only the numeric variables remain for the machine learning steps below.

candy_crush_vars <- candy_crush %>%
  select(-player_id, -dt) # keep only the numeric columns

print(candy_crush_vars)
## # A tibble: 16,865 x 3
##    level num_attempts num_success
##    <dbl>        <dbl>       <dbl>
##  1     4            3           1
##  2     8            4           1
##  3    12            6           0
##  4    11            1           1
##  5    15            6           0
##  6     8            8           1
##  7    12           15           1
##  8    12           18           0
##  9     4            1           1
## 10    15           27           0
## # ... with 16,855 more rows

7.0.2 Fitting a linear model

fit <- lm(num_attempts ~ level + num_success, data = candy_crush_vars)
summary(fit)
## 
## Call:
## lm(formula = num_attempts ~ level + num_success, data = candy_crush_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -7.647  -3.647  -1.665   1.263 251.801 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.10007    0.14361  14.624   <2e-16 ***
## level        0.36205    0.01287  28.125   <2e-16 ***
## num_success  0.11628    0.06466   1.798   0.0721 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.155 on 16862 degrees of freedom
## Multiple R-squared:  0.04531,    Adjusted R-squared:  0.04519 
## F-statistic: 400.1 on 2 and 16862 DF,  p-value: < 2.2e-16
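
For a tidier view of the same coefficients, the broom package (installed alongside the tidyverse, though not attached by default) can turn the fit into a data frame; a small sketch:

# coefficient table as a tibble, one row per model term
broom::tidy(fit)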

7.1 Loading the caret library for machine learning

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift

7.2 Creating test and train sets

We will split the dataset in two: a training set for fitting the models and a testing set for estimating how well they perform on new data. caret handles this nicely with createDataPartition(); I will use an 80-20 split.

set.seed(6969) # I just love the number; it also makes the split reproducible.

in_train <- createDataPartition(candy_crush_vars$level, p = 0.8, list = FALSE)

training <- candy_crush_vars[in_train, ]
testing <- candy_crush_vars[-in_train, ]
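
A quick sanity check that the split really is about 80-20:

nrow(training) / nrow(candy_crush_vars)
## [1] 0.8001186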

Just out of curiosity, I will take a quick look at the training and testing data.

head(training)
## # A tibble: 6 x 3
##   level num_attempts num_success
##   <dbl>        <dbl>       <dbl>
## 1     4            3           1
## 2    12            6           0
## 3    11            1           1
## 4    15            6           0
## 5     8            8           1
## 6     4            1           1
str(training)
## Classes 'tbl_df', 'tbl' and 'data.frame':    13494 obs. of  3 variables:
##  $ level       : num  4 12 11 15 8 4 15 15 8 15 ...
##  $ num_attempts: num  3 6 1 6 8 1 27 15 8 3 ...
##  $ num_success : num  1 0 1 0 1 1 0 0 0 0 ...
summary(training)
##      level         num_attempts      num_success     
##  Min.   : 1.000   Min.   :  0.000   Min.   : 0.0000  
##  1st Qu.: 6.000   1st Qu.:  1.000   1st Qu.: 0.0000  
##  Median : 9.000   Median :  3.000   Median : 1.0000  
##  Mean   : 9.294   Mean   :  5.548   Mean   : 0.6257  
##  3rd Qu.:14.000   3rd Qu.:  7.000   3rd Qu.: 1.0000  
##  Max.   :15.000   Max.   :258.000   Max.   :55.0000
nrow(training)
## [1] 13494
head(testing)
## # A tibble: 6 x 3
##   level num_attempts num_success
##   <dbl>        <dbl>       <dbl>
## 1     8            4           1
## 2    12           15           1
## 3    12           18           0
## 4    15            5           0
## 5    14            7           1
## 6     4            3           1
str(testing)
## Classes 'tbl_df', 'tbl' and 'data.frame':    3371 obs. of  3 variables:
##  $ level       : num  8 12 12 15 14 4 5 12 13 1 ...
##  $ num_attempts: num  4 15 18 5 7 3 15 5 2 1 ...
##  $ num_success : num  1 1 0 0 1 1 0 1 1 0 ...
summary(testing)
##      level        num_attempts      num_success     
##  Min.   : 1.00   Min.   :  0.000   Min.   : 0.0000  
##  1st Qu.: 6.00   1st Qu.:  1.000   1st Qu.: 0.0000  
##  Median : 9.00   Median :  3.000   Median : 1.0000  
##  Mean   : 9.26   Mean   :  5.486   Mean   : 0.6331  
##  3rd Qu.:13.00   3rd Qu.:  7.000   3rd Qu.: 1.0000  
##  Max.   :15.00   Max.   :138.000   Max.   :46.0000
nrow(testing)
## [1] 3371

7.2.1 Training the linear model, with no bootstrapping yet

fit_lm <- train(num_success ~ ., method = "lm", data = training, 
                trControl = trainControl(method = "none"))

print(fit_lm)
## Linear Regression 
## 
## 13494 samples
##     2 predictor
## 
## No pre-processing
## Resampling: None
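
For reference, turning resampling back on is just a matter of changing trainControl. A sketch (not run here) of the same model with caret's usual default of 25 bootstrap resamples:

# hypothetical: the same linear model, resampled with 25 bootstrap replicates
fit_lm_boot <- train(num_success ~ ., method = "lm", data = training,
                     trControl = trainControl(method = "boot", number = 25))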

7.2.2 Training a random forest model in the same way

fit_rf <- train(num_success ~ ., method = "rf", data = training, 
                trControl = trainControl(method = "none"))

print(fit_rf)
## Random Forest 
## 
## 13494 samples
##     2 predictor
## 
## No pre-processing
## Resampling: None
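
Under the hood, method = "rf" wraps the randomForest package, whose single tuning parameter mtry is the number of predictors sampled at each split. With method = "none", caret needs exactly one candidate value, so it can be pinned down explicitly; a sketch (mtry = 2 is an assumption, matching our two predictors):

# hypothetical: fixing mtry explicitly instead of relying on the default grid
fit_rf2 <- train(num_success ~ ., method = "rf", data = training,
                 tuneGrid = data.frame(mtry = 2),
                 trControl = trainControl(method = "none"))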

7.3 Evaluating the model

7.3.1 Let us load the library

library(yardstick)
## Loading required package: broom
## 
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
## 
##     mnLogLoss, precision, recall
## The following object is masked from 'package:readr':
## 
##     spec

7.3.2 Adding prediction columns to the training data

results <- training %>%
  mutate(`Linear Regression` = predict(fit_lm, training), 
         `Random Forest` = predict(fit_rf, training))

# the models predict num_success, so that is the truth column to evaluate against
metrics(results, truth = num_success, estimate = `Linear Regression`)
metrics(results, truth = num_success, estimate = `Random Forest`)
print(results)
## # A tibble: 13,494 x 5
##    level num_attempts num_success `Linear Regression` `Random Forest`
##    <dbl>        <dbl>       <dbl>               <dbl>           <dbl>
##  1     4            3           1               0.799           0.983
##  2    12            6           0               0.536           0.549
##  3    11            1           1               0.567           0.674
##  4    15            6           0               0.437           0.334
##  5     8            8           1               0.670           0.401
##  6     4            1           1               0.798           0.936
##  7    15           27           0               0.448           0.337
##  8    15           15           0               0.442           0.355
##  9     8            8           0               0.670           0.401
## 10    15            3           0               0.436           0.415
## # ... with 13,484 more rows

7.3.2.1 Plotting the results

results %>%
    gather(Method, Result, `Linear Regression`:`Random Forest`) %>%
    ggplot(aes(num_success, Result, color = Method)) +
    geom_point(size = 1.5, alpha = 0.5) +
    facet_wrap(~Method) +
    geom_abline(lty = 2, color = "gray50") +
    geom_smooth(method = "lm")

7.3.3 The same evaluation, but on the testing data

results <- testing %>%
  mutate(`Linear Regression` = predict(fit_lm, testing), 
         `Random Forest` = predict(fit_rf, testing))

# as before, evaluate against the true num_success values
metrics(results, truth = num_success, estimate = `Linear Regression`)
metrics(results, truth = num_success, estimate = `Random Forest`)
print(results)
## # A tibble: 3,371 x 5
##    level num_attempts num_success `Linear Regression` `Random Forest`
##    <dbl>        <dbl>       <dbl>               <dbl>           <dbl>
##  1     8            4           1               0.668           0.467
##  2    12           15           1               0.541           0.618
##  3    12           18           0               0.542           0.506
##  4    15            5           0               0.437           0.330
##  5    14            7           1               0.471           0.900
##  6     4            3           1               0.799           0.983
##  7     5           15           0               0.772           0.424
##  8    12            5           1               0.536           0.438
##  9    13            2           1               0.502           0.868
## 10     1            1           0               0.898           0.995
## # ... with 3,361 more rows

7.3.3.1 Plotting the results

results %>%
    gather(Method, Result, `Linear Regression`:`Random Forest`) %>%
    ggplot(aes(num_success, Result, color = Method)) +
    geom_point(size = 1.5, alpha = 0.5) +
    facet_wrap(~Method) +
    geom_abline(lty = 2, color = "gray50") +
    geom_smooth(method = "lm")

8 Note

For those who want a CSV copy of this dataset, I have it on GitHub. You can get the raw CSV from this link: https://raw.githubusercontent.com/joelrudinas03/Level-Difficulty-in-Candy-Crush-Saga-/master/candy_crush.csv

Grazie!