Research Questions:

Question 1. What is the Percentage of R questions in 2020?

Question 2. What are the top five programming languages by number of questions between 2015 and 2020 (inclusive)?

Load the data dictionary to understand the data

# Read the data dictionary CSV, which describes each column of the dataset.
data_dict <- read.csv(
  file = "C:/Users/MUSAAB-TECH/OneDrive/JOHNBULL/data/workspace/workspace/stack_overflow_data_dictionary.csv"
)
# Auto-print the dictionary so the column descriptions appear in the output.
data_dict

##          Column
## 1          year
## 2           tag
## 3 num_questions
## 4    year_total
##                                                                                   Description
## 1                                                 The year the question was asked (2008-2020)
## 2 A word or phrase that describes the topic of the question, such as the programming language
## 3                                     The number of questions with a certain tag in that year
## 4                                            The total number of questions asked in that year

# Location of the Stack Overflow questions CSV that is read into `data`.
# NOTE(review): this is a machine-specific absolute path, so the script only
# runs where this directory exists.
file <- "C:/Users/MUSAAB-TECH/OneDrive/JOHNBULL/data/workspace/workspace/stack_overflow_data.csv"

Read the file

# Import the questions dataset from the CSV path stored in `file`.
data <- read.csv(file = file)
# Preview the first six rows to confirm the import worked.
head(data, n = 6L)

##   year             tag num_questions year_total
## 1 2008        treeview            69     168541
## 2 2008 scheduled-tasks            30     168541
## 3 2008  specifications            21     168541
## 4 2008       rendering            35     168541
## 5 2008       http-post             6     168541
## 6 2008   static-assert             1     168541

Check the structure of the data

# Print the dimensions of `data` and each column's type with its first values.
str(data)

## 'data.frame':    420066 obs. of  4 variables:
##  $ year         : int  2008 2008 2008 2008 2008 2008 2008 2008 2008 2008 ...
##  $ tag          : chr  "treeview" "scheduled-tasks" "specifications" "rendering" ...
##  $ num_questions: int  69 30 21 35 6 1 159 10 4 20 ...
##  $ year_total   : int  168541 168541 168541 168541 168541 168541 168541 168541 168541 168541 ...

# Print per-column summary statistics (quartiles and mean for numeric columns).
summary(data)

##       year          tag            num_questions        year_total     
##  Min.   :2008   Length:420066      Min.   :     1.0   Min.   : 168541  
##  1st Qu.:2012   Class :character   1st Qu.:     2.0   1st Qu.:4787010  
##  Median :2015   Mode  :character   Median :     7.0   Median :5621997  
##  Mean   :2015                      Mean   :   142.6   Mean   :5222995  
##  3rd Qu.:2018                      3rd Qu.:    29.0   3rd Qu.:6431458  
##  Max.   :2020                      Max.   :264379.0   Max.   :6612772

Question 1. What is the Percentage of R questions in 2020

# load packages
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Share of questions tagged "r" among all questions asked in 2020.
r_2020 <- data %>%
  # Keep only the row for the "r" tag in 2020
  filter(year == 2020 & tag == "r") %>%
  # Express the tag's question count as a percentage of the yearly total,
  # rounded to two decimal places
  mutate(percentage = round(num_questions / year_total * 100, 2)) %>%
  # Columns to display in the result table
  select(year, tag, num_questions, year_total, percentage)

# Display the result to verify it
r_2020

##   year tag num_questions year_total percentage
## 1 2020   r         52662    5452545       0.97

Plot the result

# Single-bar chart of the share of R questions in 2020, labelled with its value.
ggplot(data = r_2020, mapping = aes(x = tag, y = percentage)) +
  # One sky-blue bar drawn at width 0.5
  geom_col(width = 0.5, fill = "skyblue", show.legend = FALSE) +
  # Print the percentage just above the bar; "%.2f%%" formats it with two
  # decimal places followed by a literal percent sign
  geom_text(
    mapping = aes(label = sprintf("%.2f%%", percentage)),
    size = 3.5,
    vjust = -0.7
  ) +
  scale_y_continuous(
    # No padding below the bar, 15% headroom above it for the text label
    expand = expansion(mult = c(0, 0.15)),
    # Axis tick labels shown with two decimals and a "%" suffix
    labels = scales::label_number(accuracy = 0.01, suffix = "%")
  ) +
  labs(
    title = "Percentage of R Questions in 2020",
    x = "Tag",
    y = "Percentage of Questions"
  ) +
  # Start from a minimal theme with a 12pt base font, then adjust it
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(hjust = 0.5),       # centre the title
    axis.line = element_line(colour = "black"),   # draw the axis lines
    panel.grid.major = element_blank(),           # no major grid lines
    panel.grid.minor = element_blank()            # no minor grid lines
  )

Question 2. What are the top five programming languages by number of questions between 2015 and 2020 (inclusive)?

# Restrict the data to questions asked from 2015 through 2020 (both inclusive)
filtered_data <- data %>%
  filter(year >= 2015 & year <= 2020)

# Total number of questions per tag over the period, largest first
tag_totals <- filtered_data %>%
  group_by(tag) %>%
  summarise(total_questions = sum(num_questions, na.rm = TRUE)) %>%
  arrange(desc(total_questions))

# Keep the five tags with the most questions. slice_max() keeps ties by
# default, which can return more than five rows; with_ties = FALSE caps the
# result at exactly five.
highest_tags <- tag_totals %>%
  slice_max(total_questions, n = 5, with_ties = FALSE) %>%
  select(tag, total_questions)

# Display the top five tags and their totals
highest_tags

## # A tibble: 5 × 2
##   tag        total_questions
##   <chr>                <int>
## 1 javascript         1373634
## 2 python             1187838
## 3 java                982747
## 4 android             737330
## 5 c#                  730045

Column chart of the top five programming languages by number of questions between 2015 and 2020 (inclusive)

# Column chart of the five tags in `highest_tags`, tallest bar first.
ggplot(
  highest_tags,
  aes(x = reorder(tag, -total_questions), y = total_questions, fill = tag)
) +
  # Bars are coloured per tag; the legend is suppressed because the x axis
  # already names each tag
  geom_col(show.legend = FALSE) +
  # Print each total just above its bar
  geom_text(aes(label = total_questions), vjust = -0.5, size = 3) +
  labs(
    title = "Programing Language with the highest total number of questions 2015-2020",
    x = "Programming Language",
    y = "Total Number of Questions"
  ) +
  # Minimal theme for a cleaner look, with all grid lines removed
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

Popularity trend of the top 5 programming languages (absolute totals, 2015-2020)

# Rows for the five leading tags, 2015 through 2020 (both inclusive)
filtered_data <- data %>%
  filter(between(year, 2015, 2020), tag %in% highest_tags$tag)

# One line per tag showing its yearly question count
ggplot(
  data = filtered_data,
  mapping = aes(x = year, y = num_questions, color = tag)
) +
  geom_line() +
  labs(
    title = "Trend of Top 5 Programming Languages (2015-2020)",
    x = "Year",
    y = "Number of Questions"
  ) +
  theme_minimal()

Comparison of Python and R questions between 2010 and 2020

# Yearly question counts for the "python" and "r" tags, 2010 through 2020
python_r_trends <- data %>%
  filter(
    between(year, 2010, 2020),
    tag %in% c("python", "r")
  )

# Preview the first rows of the filtered data
head(python_r_trends)

##   year    tag num_questions year_total
## 1 2010      r          2264    1970729
## 2 2010 python         27029    1970729
## 3 2011 python         42169    3473395
## 4 2011      r          5835    3473395
## 5 2012 python         64254    4787010
## 6 2012      r         12183    4787010

Plot the result

# Line chart comparing yearly question counts for Python and R
ggplot(
  data = python_r_trends,
  mapping = aes(x = year, y = num_questions, color = tag)
) +
  geom_line() +
  labs(
    title = "Python vs R Popularity Trend (2010-2020)",
    x = "Year",
    y = "Number of Questions"
  ) +
  theme_minimal()

Check the popularity trend of the top five programming languages as a percentage of total questions

# Each tag's share of its year's questions, as a percentage.
# NOTE(review): the yearly denominator is recomputed here by summing
# num_questions over all tags, whereas the Question 1 calculation divides by
# the year_total column - confirm the two agree.
percentage_trends <- data %>%
  group_by(year) %>%
  mutate(
    # Sum of the question counts of every tag within the year
    total_questions_year = sum(num_questions),
    # This tag's count relative to that yearly sum
    percentage_of_total = (num_questions / total_questions_year) * 100
  ) %>%
  ungroup()

# Percentage trend for the five leading tags, 2015 through 2020:
# filter first, then pipe the subset straight into the plot
percentage_trends %>%
  filter(tag %in% highest_tags$tag, between(year, 2015, 2020)) %>%
  ggplot(mapping = aes(x = year, y = percentage_of_total, color = tag)) +
  geom_line() +
  labs(
    title = "Percentage of Total Questions per Year for Top 5 Languages (2015-2020)",
    x = "Year",
    y = "Percentage of Total Questions"
  ) +
  theme_minimal()

Check the popularity trend of Python and R as a percentage of total questions

# Percentage trend for the "python" and "r" tags, 2010 through 2020:
# filter first, then pipe the subset straight into the plot
percentage_trends %>%
  filter(tag %in% c("python", "r"), between(year, 2010, 2020)) %>%
  ggplot(mapping = aes(x = year, y = percentage_of_total, color = tag)) +
  geom_line() +
  labs(
    title = "Percentage of Total Questions per Year for Python vs R (2010-2020)",
    x = "Year",
    y = "Percentage of Total Questions"
  ) +
  theme_minimal()

Analysis of the Popularity of Programming Languages

Johnbull Owenvbugie

This project analyses the popularity of programming languages based on the questions asked on Stack Overflow, using Stack Exchange Data Explorer data from 2008 to 2020.

Data source: Stack Overflow

Research Questions:

Read the file

Check the structure of the data

Question 1. What is the Percentage of R questions in 2020

Plot the result

Question 2. What are the top five programming languages by number of questions between 2015 and 2020 (inclusive)?

Column chart of the top five programming languages by number of questions between 2015 and 2020 (inclusive)

Popularity trend of the top 5 programming languages (absolute totals, 2015-2020)

Comparison of Python and R questions between 2010 and 2020

Plot the result

Check the popularity trend of the top five programming languages as a percentage of total questions

Check the popularity trend of Python and R as a percentage of total questions

Further polishing of the charts

Popularity trend of the top 5 programming languages with distinct colours (absolute totals, 2010-2020)

Popularity trend of the top 5 programming languages with distinct colours (percentage of total questions, 2010-2020)