# Set the knitr chunk option `echo = TRUE` as the default for every chunk.
knitr::opts_chunk$set(echo = TRUE)

My Hw3 code lines

Read in/clean the dataset leetcode.csv

# Import readr and assign csv dataset to a variable
library(readr)
library(tidyverse)
library(tidyr) 
library(rstudioapi)

# Load the raw LeetCode question table from its CSV file into a tibble.
leetcode <- "C:\\Umass\\leetcode.csv" %>%
  read_csv()
# Open the table in the data viewer for a quick visual check.
view(leetcode)
# Disabled alternative: make the folder of the open document the working
# directory.
#setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

# Report the size of the table (rows, columns) before any cleaning.
leetcode %>% dim()

[1] 1650    8

# Preview the table. The row count is taken from nrow() instead of a
# hard-coded 1650, so the call still covers every row if the CSV changes.
head(leetcode, n = nrow(leetcode))

# A tibble: 1,650 × 8
   question_id video title     link  `total Accepted` `total Submitt…`
         <dbl> <lgl> <chr>     <chr>            <dbl>            <dbl>
 1        1959 NA    Minimum … http…              262              434
 2        1949 NA    Implemen… http…              691             1126
 3        1947 NA    Number o… http…             1976             7740
 4        1946 NA    Minimum … http…             5411            13595
 5        1945 NA    Finding … http…             5890             7460
 6        1944 NA    Truncate… http…             7056             8979
 7        1943 NA    Count Pa… http…              584              850
 8        1937 NA    Maximize… http…              571              819
 9        1936 NA    Maximize… http…             2958            10959
10        1935 NA    Minimum … http…             7134            10125
# … with 1,640 more rows, and 2 more variables: difficulty <dbl>,
#   isPaid <lgl>

# Count the missing (NA) entries in each column.
leetcode %>%
  is.na() %>%
  colSums()

    question_id           video           title            link 
              0             701               0               0 
 total Accepted total Submitted      difficulty          isPaid 
              0               0               0               0

# In `video`, TRUE and FALSE are already recorded, so NA marks a question with
# no solution. The plan is to label NA as "No solution", TRUE as "Video" and
# FALSE as "Written".
#replace_na(leetcode, list(video = "No solution"))
# Call str() to confirm the column types before recoding
str(leetcode)

spec_tbl_df [1,650 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ question_id    : num [1:1650] 1959 1949 1947 1946 1945 ...
 $ video          : logi [1:1650] NA NA NA NA NA NA ...
 $ title          : chr [1:1650] "Minimum Path Cost in a Hidden Grid" "Implement Trie II (Prefix Tree)" "Number of Different Subsequences GCDs" "Minimum Absolute Sum Difference" ...
 $ link           : chr [1:1650] "https://leetcode.com/problems/minimum-path-cost-in-a-hidden-grid" "https://leetcode.com/problems/implement-trie-ii-prefix-tree" "https://leetcode.com/problems/number-of-different-subsequences-gcds" "https://leetcode.com/problems/minimum-absolute-sum-difference" ...
 $ total Accepted : num [1:1650] 262 691 1976 5411 5890 ...
 $ total Submitted: num [1:1650] 434 1126 7740 13595 7460 ...
 $ difficulty     : num [1:1650] 2 2 3 2 2 1 2 3 3 2 ...
 $ isPaid         : logi [1:1650] TRUE TRUE FALSE FALSE FALSE FALSE ...
 - attr(*, "spec")=
  .. cols(
  ..   question_id = col_double(),
  ..   video = col_logical(),
  ..   title = col_character(),
  ..   link = col_character(),
  ..   `total Accepted` = col_double(),
  ..   `total Submitted` = col_double(),
  ..   difficulty = col_double(),
  ..   isPaid = col_logical()
  .. )
 - attr(*, "problems")=<externalptr>

# Label each question's solution format from the logical `video` column.
# NA has to be detected with is.na(): `video == NA` evaluates to NA for every
# row, so that branch never matched and questions without a solution were
# left as NA instead of "No solution".
leetcode <- leetcode %>%
  mutate(solution = case_when(
    is.na(video) ~ "No solution",
    video ~ "Video",
    !video ~ "Written"
  ))
# Replace the numerical difficulty codes with readable labels
leetcode <- mutate(
  leetcode,
  trueDifficulty = recode(
    difficulty,
    `1` = "easy", `2` = "medium", `3` = "difficult"
  )
)
# Add the acceptance rate: accepted submissions divided by total submissions.
# The columns are referenced directly inside mutate() so that `rate` is a
# plain numeric vector; dividing two select() results stored a one-column
# data frame in `rate` instead.
leetcode <- mutate(leetcode, rate = `total Accepted` / `total Submitted`)
# Preview the table to check the new columns
head(leetcode, n = nrow(leetcode))

# A tibble: 1,650 × 11
   question_id video title     link  `total Accepted` `total Submitt…`
         <dbl> <lgl> <chr>     <chr>            <dbl>            <dbl>
 1        1959 NA    Minimum … http…              262              434
 2        1949 NA    Implemen… http…              691             1126
 3        1947 NA    Number o… http…             1976             7740
 4        1946 NA    Minimum … http…             5411            13595
 5        1945 NA    Finding … http…             5890             7460
 6        1944 NA    Truncate… http…             7056             8979
 7        1943 NA    Count Pa… http…              584              850
 8        1937 NA    Maximize… http…              571              819
 9        1936 NA    Maximize… http…             2958            10959
10        1935 NA    Minimum … http…             7134            10125
# … with 1,640 more rows, and 5 more variables: difficulty <dbl>,
#   isPaid <lgl>, solution <chr>, trueDifficulty <chr>, rate <df[,1]>

Identify the variables in the dataset and the dataset

# Show the first six rows of the dataset together with its column headers
head(leetcode, n = 6L)

# A tibble: 6 × 11
  question_id video title      link  `total Accepted` `total Submitt…`
        <dbl> <lgl> <chr>      <chr>            <dbl>            <dbl>
1        1959 NA    Minimum P… http…              262              434
2        1949 NA    Implement… http…              691             1126
3        1947 NA    Number of… http…             1976             7740
4        1946 NA    Minimum A… http…             5411            13595
5        1945 NA    Finding t… http…             5890             7460
6        1944 NA    Truncate … http…             7056             8979
# … with 5 more variables: difficulty <dbl>, isPaid <lgl>,
#   solution <chr>, trueDifficulty <chr>, rate <df[,1]>

# List the names of all columns in the dataset
names(leetcode)

 [1] "question_id"     "video"           "title"          
 [4] "link"            "total Accepted"  "total Submitted"
 [7] "difficulty"      "isPaid"          "solution"       
[10] "trueDifficulty"  "rate"

As the colnames() output shows, we have listed all the variables

Variable type classification:

String/char type : title, link, trueDifficulty

title definition: the leetcode question description titles on the official website

link definition: official link to the specific question

trueDifficulty definition: a readable label for difficulty, where we transform 1 to easy, 2 to medium, and 3 to difficult. Usually, the higher the difficulty, the more time a user is likely to spend on the question

Numeric type : question_id, total Accepted, total Submitted, difficulty, rate

question_id definition: the number which corresponds to each question

total Accepted definition: number of submitted solutions which compile and run correctly

total Submitted definition: number of solutions/coding attempts submitted to the website, including ones that may be wrong

difficulty definition: in general, this represents the learning curve and how challenging a typical question could be

rate definition: total Accepted divided by total Submitted, which gives the proportion of submitted solutions that were accepted by the server

Logical type : video, isPaid

video definition: TRUE means the solution has video format, FALSE means written, NA means none

isPaid definition: TRUE means the question requires premium membership, FALSE means not requiring

Research questions

The questions are identified as below so far with this dataset:

-How is the difficulty of the questions correlated with the total submitted attempts?

-How does the acceptance rate, which we have added as a new column, correlate with the difficulty of the questions?

-Could the number id of the question affect the submitted attempts for each question?

-How does the video/written solution format correlate with the submitted attempts and the acceptance rate?

-Are the paid questions guaranteed to provide at least one form of written solution?

Hopefully, I could answer the above questions soon

library(dplyr)
# Keep only the questions that have no solution, i.e. rows where `video` is NA.
# Note: despite its name, `solvedQuestions` holds the questions WITHOUT a
# solution.
solvedQuestions <- leetcode %>%
  filter(is.na(video))
solvedQuestions %>% head()

# A tibble: 6 × 11
  question_id video title      link  `total Accepted` `total Submitt…`
        <dbl> <lgl> <chr>      <chr>            <dbl>            <dbl>
1        1959 NA    Minimum P… http…              262              434
2        1949 NA    Implement… http…              691             1126
3        1947 NA    Number of… http…             1976             7740
4        1946 NA    Minimum A… http…             5411            13595
5        1945 NA    Finding t… http…             5890             7460
6        1944 NA    Truncate … http…             7056             8979
# … with 5 more variables: difficulty <dbl>, isPaid <lgl>,
#   solution <chr>, trueDifficulty <chr>, rate <df[,1]>

# Sort the questions from the highest to the lowest acceptance rate.
rateDescending <- arrange(leetcode, desc(rate))
# Preview the sorted rates. The preview has to read from `rateDescending`;
# selecting from `leetcode` showed the original, unsorted order.
head(select(rateDescending, rate))

# A tibble: 6 × 1
  rate$`total Accepted`
                  <dbl>
1                 0.604
2                 0.614
3                 0.255
4                 0.398
5                 0.790
6                 0.786

# Hence we can now rank the questions in a different order

Hw3HuidiDing

My Hw3 code lines

Read in/clean the dataset leetcode.csv

Identify the variables in the dataset and the dataset

Research questions