knitr::opts_chunk$set(echo = TRUE)

My Hw4 code lines and content,

Read in/clean the dataset leetcode.csv,

Compute descriptive statistics for each of variable including mean, median, and standard deviation for numerical variables, and frequencies for categorical variables,

Visualize using ggplot2 with both univariate and bivariate; Explain these graphs with variables, questions and conclusion,

Identify limitations of the visualization if any.

# Import readr and assign csv dataset to a variable
library(readr)
library(tidyverse)
library(tidyr) 
library(rstudioapi)
library(ggplot2)
library(dplyr)
library(plotrix)

# Getting the path of your current open file automatically
leetcode <- read_csv("leetcode.csv")
#newData <- data.frame(leetcode)
view(leetcode)

# Preview the whole dataset before any operations
dim(leetcode)

[1] 1650    8

head(leetcode, n = 1650)

# A tibble: 1,650 × 8
   question_id video title     link  `total Accepted` `total Submitt…`
         <dbl> <lgl> <chr>     <chr>            <dbl>            <dbl>
 1        1959 NA    Minimum … http…              262              434
 2        1949 NA    Implemen… http…              691             1126
 3        1947 NA    Number o… http…             1976             7740
 4        1946 NA    Minimum … http…             5411            13595
 5        1945 NA    Finding … http…             5890             7460
 6        1944 NA    Truncate… http…             7056             8979
 7        1943 NA    Count Pa… http…              584              850
 8        1937 NA    Maximize… http…              571              819
 9        1936 NA    Maximize… http…             2958            10959
10        1935 NA    Minimum … http…             7134            10125
# … with 1,640 more rows, and 2 more variables: difficulty <dbl>,
#   isPaid <lgl>

# Check NA availability if any
colSums(is.na(leetcode))

    question_id           video           title            link 
              0             701               0               0 
 total Accepted total Submitted      difficulty          isPaid 
              0               0               0               0

# Replace the NA value in videos as "No solution" since FALSE and TRUE have been written down, rename TRUE as "Video" and FALSE as "Written"
#replace_na(leetcode, list(video = "No solution"))
# Calling str() function to confirm
str(leetcode)

spec_tbl_df [1,650 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ question_id    : num [1:1650] 1959 1949 1947 1946 1945 ...
 $ video          : logi [1:1650] NA NA NA NA NA NA ...
 $ title          : chr [1:1650] "Minimum Path Cost in a Hidden Grid" "Implement Trie II (Prefix Tree)" "Number of Different Subsequences GCDs" "Minimum Absolute Sum Difference" ...
 $ link           : chr [1:1650] "https://leetcode.com/problems/minimum-path-cost-in-a-hidden-grid" "https://leetcode.com/problems/implement-trie-ii-prefix-tree" "https://leetcode.com/problems/number-of-different-subsequences-gcds" "https://leetcode.com/problems/minimum-absolute-sum-difference" ...
 $ total Accepted : num [1:1650] 262 691 1976 5411 5890 ...
 $ total Submitted: num [1:1650] 434 1126 7740 13595 7460 ...
 $ difficulty     : num [1:1650] 2 2 3 2 2 1 2 3 3 2 ...
 $ isPaid         : logi [1:1650] TRUE TRUE FALSE FALSE FALSE FALSE ...
 - attr(*, "spec")=
  .. cols(
  ..   question_id = col_double(),
  ..   video = col_logical(),
  ..   title = col_character(),
  ..   link = col_character(),
  ..   `total Accepted` = col_double(),
  ..   `total Submitted` = col_double(),
  ..   difficulty = col_double(),
  ..   isPaid = col_logical()
  .. )
 - attr(*, "problems")=<externalptr>

leetcode <- leetcode %>%
  mutate(solution = case_when(
    video == TRUE ~ "Video",
    video == FALSE ~ "Written",
    video == NA ~ "No solution",))

# Define the column rate which could be calculated by division between totalAccepted andd totalSubmitted
acceptedRate <- select(leetcode, `total Accepted`)
totalSubmitted <- select(leetcode, `total Submitted`)
# Add new column named acceptedRate to leetcode dataset
leetcode <- mutate(leetcode, rate = (acceptedRate / totalSubmitted))
#rate <- transform(rate, ar = accepetedRate / totalSubmitted)
# Preview the rate to check whether it is right
head(leetcode, n = 1650)

# A tibble: 1,650 × 10
   question_id video title     link  `total Accepted` `total Submitt…`
         <dbl> <lgl> <chr>     <chr>            <dbl>            <dbl>
 1        1959 NA    Minimum … http…              262              434
 2        1949 NA    Implemen… http…              691             1126
 3        1947 NA    Number o… http…             1976             7740
 4        1946 NA    Minimum … http…             5411            13595
 5        1945 NA    Finding … http…             5890             7460
 6        1944 NA    Truncate… http…             7056             8979
 7        1943 NA    Count Pa… http…              584              850
 8        1937 NA    Maximize… http…              571              819
 9        1936 NA    Maximize… http…             2958            10959
10        1935 NA    Minimum … http…             7134            10125
# … with 1,640 more rows, and 4 more variables: difficulty <dbl>,
#   isPaid <lgl>, solution <chr>, rate <df[,1]>

# Compute the median for the accepted solutions and submitted solutions
summarize(leetcode, mean.totalAccepted = mean(`total Accepted`, na.rm = TRUE))

# A tibble: 1 × 1
  mean.totalAccepted
               <dbl>
1            128905.

summarize(leetcode, mean.totalSubmitted = mean(`total Submitted`, na.rm = TRUE))

# A tibble: 1 × 1
  mean.totalSubmitted
                <dbl>
1             280704.

# Compute the average for the difficulty, 1.978 which means a bit below medium
summarize(leetcode, mean.difficulty = mean(difficulty, na.rm = TRUE))

# A tibble: 1 × 1
  mean.difficulty
            <dbl>
1            1.98

# Compute the sd for the accepted solutions
summarize(leetcode, sd.totalAccepted = sd(`total Accepted`, na.rm = TRUE))

# A tibble: 1 × 1
  sd.totalAccepted
             <dbl>
1          233157.

# Group by the dataset by the number level of difficulty and calculate the average for submitted count for each difficulty
leetcode %>%
  group_by(difficulty) %>%
  summarize(mean.totoalSubmittedDifficult=mean(`total Submitted`))

# A tibble: 3 × 2
  difficulty mean.totoalSubmittedDifficult
       <dbl>                         <dbl>
1          1                       418797.
2          2                       269992.
3          3                       155624.

# Compute freq table with leetcode$difficulty
frequencyDifficulty <- leetcode %>%
 select(difficulty) %>%
 table()
frequencyDifficulty

difficulty
  1   2   3 
397 891 362

frequencyDifficulty %>%
 prop.table()

difficulty
        1         2         3 
0.2406061 0.5400000 0.2193939

# Visualize the pie chart graph showing the total submitted attempts over the 3 levels of difficulty
pie(frequencyDifficulty,labels = frequencyDifficulty, edges=10, main = "Total count for each difficulty")

# The graph has variables including frequency for each difficulty level; 

# It tries to answer the question "How is the difficulty of the questions correlated with the total submitted attempts?";

# It could not be concluded that the higher the difficulty as the questions become more challenging , the fewer the total submitted attempts will be

# More work is needed to add the mean.totalSubmitted to the graph


# Make the plot leetcode access values from question_id


# Visualize the histogram graph showing the question id as x-axis and the count for each question as y-axis
ggplot(leetcode, aes(`question_id`)) + geom_histogram(binwidth = 5, color="black", fill="white") +
labs(title = "question_id") +
theme_bw()

# The graph has variables including question id as x-axis and the count for each question as y-axis; 

# It tries to answer the question "Could the number id of the question affect the submitted attempts for each question?";

# It could be not concluded the question_id might affect the submitted attempts for each question. Additional work is needed

# Replace the numercal value in difficulty, recode leetcode to make a boxplot
#The result should be three separate box plots for different values of year_postgrad
leetcode<-mutate(leetcode, trueDifficulty = recode(difficulty, `1` = "easy", `2` = "medium", `3` = "difficult"))

# Visualize the boxplot graph showing the relationship between question difficulty and the count for total submitted each question as y-axis
ggplot(leetcode, aes(`trueDifficulty`, `total Submitted`)) + geom_boxplot()

# The graph has variables including question difficulty and the count for total submitted; 

# It tries to answer the question "How is the difficulty of the questions correlated with the total submitted attempts";

# It could not be concluded that the higher the difficulty as the questions become more challenging , the fewer the total submitted attempts will be

Identify the variables in the dataset and the dataset

# Preview and get the column names of the dataset
head(leetcode)

# A tibble: 6 × 11
  question_id video title      link  `total Accepted` `total Submitt…`
        <dbl> <lgl> <chr>      <chr>            <dbl>            <dbl>
1        1959 NA    Minimum P… http…              262              434
2        1949 NA    Implement… http…              691             1126
3        1947 NA    Number of… http…             1976             7740
4        1946 NA    Minimum A… http…             5411            13595
5        1945 NA    Finding t… http…             5890             7460
6        1944 NA    Truncate … http…             7056             8979
# … with 5 more variables: difficulty <dbl>, isPaid <lgl>,
#   solution <chr>, rate <df[,1]>, trueDifficulty <chr>

colnames(leetcode)

 [1] "question_id"     "video"           "title"          
 [4] "link"            "total Accepted"  "total Submitted"
 [7] "difficulty"      "isPaid"          "solution"       
[10] "rate"            "trueDifficulty"

As you may see with colnames(), we have list all variables

Varible type classification:

String/char type : title, link, trueDifficult

title definition: the leetcode question description titles on the official website

link definition: offcial link to the specific question

trueDifficulty definition: here as you may know, we tranform 1 to easy difficulty, 2 to medium difficulty, 3 to hard difficulty. Usuallly, higher the difficulty, the more likely the user might spend time

Numeric type : question_id, total Accepted, total Submitted, difficulty, total Accepted(updated)

question_id definition: the number which corresponds to each question total Accepted definition: number of submitted solutions which can compile and run well

total Submitted definition: number of solutions/coding attempt submitted to the website, could be wrong

difficulty definition: in general, this represents the learning curve and how challenging a typical question could be

rate definition: we use the division between totalAccepted andd totalSubmitted to get the percentage which the submitted solution have been approved by the server

Logical type : video, isPaid

video definition: TRUE means the solution has video format, FALSE means written, NA means none

isPaid definition: TRUE means the question requires premium membership, FALSE means not requiring

Research questions

The questions are identified as below so far with this dataset:

-How is the difficulty of the questions correlated with the total submitted attempts?

-How is the acceptance rate which we have added as a new column correlate to the difficulty of the questions?

-Could the number id of the question affect the submitted attempts for each question?

-How is the video/written solution correlate to the submitted attempts and acceptance rate?

-Are the paid questions guaranteed to provide at least one form of written solution?

Hopefully, I could answer the above questions soon

#Use filter() to list all the questions without solutions
solvedQuestions <- filter(leetcode, is.na(`video`)) 
head(solvedQuestions)

# A tibble: 6 × 11
  question_id video title      link  `total Accepted` `total Submitt…`
        <dbl> <lgl> <chr>      <chr>            <dbl>            <dbl>
1        1959 NA    Minimum P… http…              262              434
2        1949 NA    Implement… http…              691             1126
3        1947 NA    Number of… http…             1976             7740
4        1946 NA    Minimum A… http…             5411            13595
5        1945 NA    Finding t… http…             5890             7460
6        1944 NA    Truncate … http…             7056             8979
# … with 5 more variables: difficulty <dbl>, isPaid <lgl>,
#   solution <chr>, rate <df[,1]>, trueDifficulty <chr>

#descend to find the questions with highest accepted rate
rateDescending <- arrange(leetcode, desc(`rate`))
head(select(leetcode, `rate`))

# A tibble: 6 × 1
  rate$`total Accepted`
                  <dbl>
1                 0.604
2                 0.614
3                 0.255
4                 0.398
5                 0.790
6                 0.786

#Hence we know could rank the questions in another order

Hw4HuidiDing