This project analyses the popularity of programming-language questions asked on Stack Overflow, using Stack Exchange Data Explorer data from 2008 to 2020.

Data source: Stack Overflow

Research Questions:

Question 1. What is the percentage of R questions in 2020?

Question 2. What are the top five programming languages by number of questions between 2015 and 2020 (inclusive)?

Load the data dictionary to understand the data

# Import the data dictionary, which documents each column of the dataset,
# and display it for reference
data_dict <- read.csv(
  file = "C:/Users/MUSAAB-TECH/OneDrive/JOHNBULL/data/workspace/workspace/stack_overflow_data_dictionary.csv"
)
data_dict
##          Column
## 1          year
## 2           tag
## 3 num_questions
## 4    year_total
##                                                                                   Description
## 1                                                 The year the question was asked (2008-2020)
## 2 A word or phrase that describes the topic of the question, such as the programming language
## 3                                     The number of questions with a certain tag in that year
## 4                                            The total number of questions asked in that year
# Path to the Stack Overflow questions dataset (CSV).
# NOTE(review): absolute, machine-specific path; adjust before running on
# another computer.
file <- "C:/Users/MUSAAB-TECH/OneDrive/JOHNBULL/data/workspace/workspace/stack_overflow_data.csv"

Read the file

# Import the CSV into a data frame, then preview its first six rows
data <- read.csv(file = file)
head(data, n = 6L)
##   year             tag num_questions year_total
## 1 2008        treeview            69     168541
## 2 2008 scheduled-tasks            30     168541
## 3 2008  specifications            21     168541
## 4 2008       rendering            35     168541
## 5 2008       http-post             6     168541
## 6 2008   static-assert             1     168541

Check the structure of the data

# Show each column's type together with a sample of its values
str(data)
## 'data.frame':    420066 obs. of  4 variables:
##  $ year         : int  2008 2008 2008 2008 2008 2008 2008 2008 2008 2008 ...
##  $ tag          : chr  "treeview" "scheduled-tasks" "specifications" "rendering" ...
##  $ num_questions: int  69 30 21 35 6 1 159 10 4 20 ...
##  $ year_total   : int  168541 168541 168541 168541 168541 168541 168541 168541 168541 168541 ...
# Summary statistics for every column
summary(data)
##       year          tag            num_questions        year_total     
##  Min.   :2008   Length:420066      Min.   :     1.0   Min.   : 168541  
##  1st Qu.:2012   Class :character   1st Qu.:     2.0   1st Qu.:4787010  
##  Median :2015   Mode  :character   Median :     7.0   Median :5621997  
##  Mean   :2015                      Mean   :   142.6   Mean   :5222995  
##  3rd Qu.:2018                      3rd Qu.:    29.0   3rd Qu.:6431458  
##  Max.   :2020                      Max.   :264379.0   Max.   :6612772

Question 1. What is the percentage of R questions in 2020?

# load packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Keep only the 2020 row for the "r" tag, express its question count as a
# percentage of that year's total (rounded to two decimals), and keep the
# columns to be displayed
r_2020 <- data |>
  filter(year == 2020 & tag == "r") |>
  mutate(percentage = round((num_questions / year_total) * 100, digits = 2)) |>
  select(all_of(c("year", "tag", "num_questions", "year_total", "percentage")))

# Print the result to verify it
r_2020
##   year tag num_questions year_total percentage
## 1 2020   r         52662    5452545       0.97

Plot the result

# Single-bar chart of the share of 2020 questions tagged "r"
ggplot(data = r_2020, mapping = aes(x = tag, y = percentage)) +
  # The bar itself: sky-blue fill, half width, no legend
  geom_col(width = 0.5, fill = "skyblue", show.legend = FALSE) +
  # Value label just above the bar; "%.2f%%" keeps the two decimals
  # produced by round() and appends a percent sign
  geom_text(
    mapping = aes(label = sprintf("%.2f%%", percentage)),
    size = 3.5,
    vjust = -0.7
  ) +
  labs(
    title = "Percentage of R Questions in 2020",
    x = "Tag",
    y = "Percentage of Questions"
  ) +
  scale_y_continuous(
    # Leave 15% headroom above the bar so the value label is not clipped
    expand = expansion(mult = c(0, 0.15)),
    # Axis ticks shown with two decimals and a "%" suffix
    labels = scales::label_number(accuracy = 0.01, suffix = "%")
  ) +
  # Clean base theme, then drop the grid, restore axis lines, centre the title
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.line = element_line(colour = "black"),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank()
  )

Question 2. What are the top five programming languages by number of questions between 2015 and 2020 (inclusive)?

# Restrict the data to the 2015-2020 window (both ends included)
filtered_data <- data |>
  filter(year >= 2015, year <= 2020)

# Add up the question counts per tag and sort from most to least asked
tag_totals <- filtered_data |>
  group_by(tag) |>
  summarise(total_questions = sum(num_questions, na.rm = TRUE)) |>
  arrange(desc(total_questions))

# Keep the five tags with the largest totals.
# NOTE(review): every tag is ranked, so non-language tags (e.g. "android")
# can appear in this list.
highest_tags <- tag_totals |>
  slice_max(order_by = total_questions, n = 5) |>
  select(tag, total_questions)

highest_tags
## # A tibble: 5 × 2
##   tag        total_questions
##   <chr>                <int>
## 1 javascript         1373634
## 2 python             1187838
## 3 java                982747
## 4 android             737330
## 5 c#                  730045

Column chart of the top five programming languages by number of questions between 2015 and 2020 (inclusive)

# Column chart of the five most-asked tags, tallest bar first
ggplot(
  data = highest_tags,
  mapping = aes(
    x = reorder(tag, -total_questions),
    y = total_questions,
    fill = tag
  )
) +
  # Bars are coloured per tag; the legend is hidden because the x axis
  # already names each tag
  geom_col(show.legend = FALSE) +
  # Print each total just above its bar
  geom_text(aes(label = total_questions), size = 3, vjust = -0.5) +
  labs(
    title = "Programing Language with the highest total number of questions 2015-2020",
    x = "Programming Language",
    y = "Total Number of Questions"
  ) +
  # Minimal theme with all grid lines removed for a cleaner look
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

The popularity trend of the top 5 programming languages (absolute totals, 2015-2020)

# Keep only the rows for the top-five tags within 2015-2020
filtered_data <- data |>
  filter(year >= 2015, year <= 2020, tag %in% highest_tags$tag)

# One line per tag showing the yearly number of questions
ggplot(
  data = filtered_data,
  mapping = aes(x = year, y = num_questions, colour = tag)
) +
  geom_line() +
  theme_minimal() +
  labs(
    title = "Trend of Top 5 Programming Languages (2015-2020)",
    x = "Year",
    y = "Number of Questions"
  )

Comparison between Python and R questions between 2010 and 2020

# Keep the rows tagged "python" or "r" from 2010 through 2020
python_r_trends <- data |>
  filter(year >= 2010, year <= 2020, tag %in% c("python", "r"))

# Preview the first six rows
head(python_r_trends, n = 6L)
##   year    tag num_questions year_total
## 1 2010      r          2264    1970729
## 2 2010 python         27029    1970729
## 3 2011 python         42169    3473395
## 4 2011      r          5835    3473395
## 5 2012 python         64254    4787010
## 6 2012      r         12183    4787010

Plot the result

# Yearly number of questions for Python and R, one line per tag
ggplot(
  data = python_r_trends,
  mapping = aes(x = year, y = num_questions, colour = tag)
) +
  geom_line() +
  theme_minimal() +
  labs(
    title = "Python vs R Popularity Trend (2010-2020)",
    x = "Year",
    y = "Number of Questions"
  )

Check the popularity trend of the top programming languages using the percentage of total questions

# Further analysis: percentage of each year's questions that carry a given tag.
#
# The denominator is `year_total`, which the data dictionary describes as the
# total number of questions asked in that year. This is the same definition
# used for the 2020 R percentage, so the trend charts agree with that figure.
# `total_questions_year` (the per-year sum of the tag counts) is still computed
# so that any code reading that column keeps working; `na.rm = TRUE` matches
# the way the per-tag totals are summed.
percentage_trends <- data %>%
  group_by(year) %>%
  mutate(total_questions_year = sum(num_questions, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(percentage_of_total = (num_questions / year_total) * 100)

# Percentage trend for the top-five tags, 2015-2020, one line per tag
percentage_trends |>
  filter(tag %in% highest_tags$tag, year >= 2015, year <= 2020) |>
  ggplot(mapping = aes(x = year, y = percentage_of_total, colour = tag)) +
  geom_line() +
  theme_minimal() +
  labs(
    title = "Percentage of Total Questions per Year for Top 5 Languages (2015-2020)",
    x = "Year",
    y = "Percentage of Total Questions"
  )

Check the popularity trend of Python and R using the percentage of total questions

# Percentage trend for the "python" and "r" tags, 2010-2020, one line per tag
percentage_trends |>
  filter(tag %in% c("python", "r"), year >= 2010, year <= 2020) |>
  ggplot(mapping = aes(x = year, y = percentage_of_total, colour = tag)) +
  geom_line() +
  theme_minimal() +
  labs(
    title = "Percentage of Total Questions per Year for Python vs R (2010-2020)",
    x = "Year",
    y = "Percentage of Total Questions"
  )

Further polishing of the charts

The popularity trend of the top 5 programming languages with distinct colours (absolute totals, 2015-2020)

library(RColorBrewer)     # colour-blind-safe palettes
library(ggrepel)          # smart text placement

# Five clearly separated hues from the ColorBrewer "Set1" palette,
# one per plotted tag
lang_pal <- brewer.pal(5, "Set1")          # red, blue, green, purple, orange

# Absolute yearly question counts for the top-five tags, with each line
# labelled directly at its final year
ggplot(filtered_data,
       aes(x = year, y = num_questions,
           colour = tag, group = tag)) +
  # `linewidth` replaces the `size` argument for lines, which has been
  # deprecated since ggplot2 3.4.0
  geom_line(linewidth = 1.1) +
  # add labels at the line end (the latest year in the data)
  geom_text_repel(
    data = filtered_data |> dplyr::filter(year == max(year)),
    aes(label = tag),
    hjust = 0, nudge_x = 0.4,            # push a little to the right
    direction = "y", segment.color = NA  # no line segment, cleaner look
  ) +
  scale_colour_manual(values = lang_pal, name = "Language") +
  coord_cartesian(clip = "off") +         # let labels stick out to the right
  theme_minimal(base_size = 12) +
  theme(plot.margin = margin(5.5, 40, 5.5, 5.5))    # room for labels
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

The popularity trend of the top 5 programming languages with distinct colours (percentage of total questions, 2015-2020)

# Share-of-questions trend for the top-five tags (2015-2020), with each line
# labelled directly at its final year
ggplot(percentage_trends |> 
         dplyr::filter(tag %in% highest_tags$tag,
                       dplyr::between(year, 2015, 2020)),
       aes(year, percentage_of_total,
           colour = tag, group = tag)) +
  # `linewidth` replaces the `size` argument for lines, which has been
  # deprecated since ggplot2 3.4.0
  geom_line(linewidth = 1.1) +
  geom_text_repel(
    # label only the latest-year point of each top-five tag
    data = subset(percentage_trends, year == max(year) & tag %in% highest_tags$tag),
    aes(label = tag),
    hjust = 0, nudge_x = 0.4, segment.color = NA
  ) +
  scale_colour_manual(values = lang_pal, name = "Language (%)") +
  coord_cartesian(clip = "off") +         # let labels extend past the panel
  theme_minimal(base_size = 12) +
  theme(plot.margin = margin(5.5, 40, 5.5, 5.5))    # right margin for labels