The code and data used here can be found in the original repository. The data used in this report is a sample of the commits made to some of the repositories on GitHub each day from 2016 to 2017.
# Load the commit sample, declaring the type of every column explicitly so
# that nothing is left to readr's type guessing.
data <- readr::read_csv(
  here::here("data/github-users-committing-filetypes.csv"),
  col_types = cols(
    file_extension = col_character(),
    month_day = col_integer(),
    the_month = col_integer(),
    the_year = col_integer(),
    users = col_integer()
  ),
  progress = FALSE
)

# Compact overview of the columns and their first values.
glimpse(data)
## Observations: 13,802
## Variables: 5
## $ file_extension <chr> "md", "md", "md", "md", "md", "md", "md", "md",...
## $ month_day <int> 18, 17, 27, 16, 26, 21, 4, 22, 23, 1, 12, 3, 2,...
## $ the_month <int> 2, 2, 1, 2, 1, 3, 2, 2, 2, 2, 4, 2, 2, 2, 4, 3,...
## $ the_year <int> 2016, 2016, 2016, 2016, 2016, 2017, 2016, 2016,...
## $ users <int> 10279, 10208, 10118, 10045, 10020, 10015, 9991,...
We will refer to popularity as the median number of users that committed files of a certain language. In other words, the more people commit files in a programming language, the more popular it is.
# Build a proper date for every row out of its year / month / day columns,
# then flag whether that date falls on a weekend.
data <- data %>%
  mutate(
    cronology = lubridate::ymd(paste(the_year, the_month, month_day)),
    isWeekend = timeDate::isWeekend(cronology)
  )

# Print ten randomly drawn rows to eyeball the two new columns.
sample_n(data, 10)
# Stacked bars: total number of committing users per file extension, with the
# weekday and weekend totals as the two fill segments. Extensions are placed
# along the x axis in the order given by reorder() on those totals.
data %>%
  group_by(file_extension, isWeekend) %>%
  summarise(popularity = sum(users)) %>%
  ggplot(
    mapping = aes(
      x = reorder(file_extension, popularity),
      y = popularity,
      fill = isWeekend
    )
  ) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(x = "File extension", y = "Number of editing users")
# Lollipop chart of the total number of committing users on each date.
# The plot is built with the user count on x and the date on y and then
# flipped, so dates end up on the horizontal axis.
data %>%
  group_by(cronology, isWeekend) %>%
  summarise(popularity = sum(users)) %>%
  ggplot(mapping = aes(x = popularity, y = cronology, color = isWeekend)) +
  # Thin grey stem from zero up to each day's total.
  geom_segment(
    mapping = aes(x = 0, xend = popularity, y = cronology, yend = cronology),
    size = 0.15,
    color = "grey50"
  ) +
  geom_point() +
  coord_flip() +
  labs(x = "Number of editing users", y = "Point in Time")
# Median number of committing users (the report's "popularity") for Python
# and Java, computed separately for weekdays and weekends.
data %>%
  filter(file_extension %in% c("py", "java")) %>%
  group_by(file_extension, isWeekend) %>%
  summarise(popularity = median(users)) %>%
  ggplot(
    mapping = aes(
      x = reorder(file_extension, popularity),
      y = popularity,
      fill = isWeekend
    )
  ) +
  # Medians are not additive: with the default stacking the bar height would
  # be weekday median + weekend median, a number with no meaning. Dodging
  # puts the two medians side by side so each can be read off the y axis.
  geom_bar(stat = "identity", position = "dodge") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(x = "File extension", y = "Popularity")
# Per-day lollipop chart of committing users for Java and Python, one facet
# row per language. Built with users on x and the date on y, then flipped so
# that dates run along the horizontal axis.
data %>%
  filter(file_extension %in% c("py", "java")) %>%
  ggplot(mapping = aes(x = users, y = cronology, color = isWeekend)) +
  # Thin grey stem from zero up to each day's user count.
  geom_segment(
    mapping = aes(x = 0, xend = users, y = cronology, yend = cronology),
    size = 0.15,
    color = "grey50"
  ) +
  geom_point() +
  facet_wrap(~ file_extension, nrow = 2) +
  coord_flip() +
  labs(x = "Number of editing users", y = "Point in Time")
Since describing the sample alone isn’t enough to draw conclusions about the population (coders on GitHub), the rest of this report makes use of statistical inference.
# Distribution of daily Java committers: one box for weekdays and one for
# weekends. Both share a single dummy x position, so the x axis carries no
# information and its title, labels and ticks are removed.
data %>%
  filter(file_extension == "java") %>%
  ggplot(
    mapping = aes(
      x = "",
      y = users,
      fill = isWeekend,
      group = isWeekend
    )
  ) +
  geom_boxplot() +
  ggtitle("Java popularity (weekdays vs weekend)") +
  labs(y = "Number of editing users") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
# 95% percentile bootstrap interval (10,000 resamples) for the median number
# of daily Java committers on weekdays.
java.week <- data %>%
  filter(file_extension == "java", !isWeekend) %>%
  bootstrap(median(users), R = 10000) %>%
  CI.percentile(probs = c(.025, .975))

# Same interval, restricted to weekend days.
java.weekend <- data %>%
  filter(file_extension == "java", isWeekend) %>%
  bootstrap(median(users), R = 10000) %>%
  CI.percentile(probs = c(.025, .975))

# Print both intervals under a heading each.
cat("Java on week days:\n")
java.week
cat("\n\nJava on weekend days:\n")
java.weekend
## Java on week days:
## 2.5% 97.5%
## median(users) 3534 3667
##
##
## Java on weekend days:
## 2.5% 97.5%
## median(users) 2058 2167
# Put the weekday and weekend intervals in one two-row data frame.
# data.frame() turns the "2.5%" / "97.5%" column names into X2.5. / X97.5.,
# which is what the aesthetics below refer to.
df <- data.frame(
  rbind(
    java.week,
    java.weekend[rownames(java.weekend), ]
  )
)
df$medida <- c("JAVA (Week)", "JAVA (Weekend)")

# One error bar per group, spanning that group's confidence interval.
ggplot(df, mapping = aes(x = medida, ymin = X2.5., ymax = X97.5.)) +
  geom_errorbar(width = .2) +
  labs(x = "Group", y = "Popularity")
Looking at the confidence intervals (C.I.) of Java popularity during the week and during the weekend, we can say with 95% confidence that there is a statistically significant difference between Java popularity during the week and during the weekend, because the two intervals do not overlap.
# Keep only the Java rows.
java <- filter(data, file_extension == "java")

# Two-sample bootstrap (10,000 resamples) of the difference in median users
# between the isWeekend == FALSE and isWeekend == TRUE groups; the printed
# label shows the direction as FALSE-TRUE, i.e. weekday minus weekend.
b.diff.means <- bootstrap2(
  java$users,
  median,
  treatment = java$isWeekend,
  R = 10000
)

# 95% percentile interval of that difference; printed, then plotted.
means.diff <- CI.percentile(b.diff.means, probs = c(.025, .975))
means.diff

# Error bar for the interval with a reference line at zero (no difference).
ggplot(
  data.frame(means.diff),
  mapping = aes(x = "Difference", ymin = X2.5., ymax = X97.5.)
) +
  geom_errorbar(width = .2) +
  geom_hline(yintercept = 0, colour = "darkorange") +
  labs(x = "")
## 2.5% 97.5%
## median: FALSE-TRUE 1390.5 1564.5
Looking at the confidence interval (C.I.) of the unpaired difference between Java popularity during the week and Java popularity during the weekend, we can say with 95% confidence that Java is more popular during the week than during the weekend, since the whole interval lies above zero.
# Distribution of daily Python committers: one box for weekdays and one for
# weekends. Both share a single dummy x position, so the x axis carries no
# information and its title, labels and ticks are removed.
data %>%
  filter(file_extension == "py") %>%
  ggplot(
    mapping = aes(
      x = "",
      y = users,
      fill = isWeekend,
      group = isWeekend
    )
  ) +
  geom_boxplot() +
  labs(x = "", y = "Number of editing users") +
  ggtitle("Python popularity (weekday vs weekend)") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
# 95% percentile bootstrap interval (10,000 resamples) for the median number
# of daily Python committers on weekdays.
python.week <- data %>%
  filter(file_extension == "py", !isWeekend) %>%
  bootstrap(median(users), R = 10000) %>%
  CI.percentile(probs = c(.025, .975))

# Same interval, restricted to weekend days.
python.weekend <- data %>%
  filter(file_extension == "py", isWeekend) %>%
  bootstrap(median(users), R = 10000) %>%
  CI.percentile(probs = c(.025, .975))

# Print both intervals under a heading each.
cat("Python on week days:\n")
python.week
cat("\n\nPython on weekend days:\n")
python.weekend
## Python on week days:
## 2.5% 97.5%
## median(users) 4667.254 4791
##
##
## Python on weekend days:
## 2.5% 97.5%
## median(users) 2523 2644
# Put the weekday and weekend intervals in one two-row data frame.
# python.weekend is indexed by its OWN row names (the original borrowed the
# row names of python.week, which only worked because both objects happen to
# label their single row identically).
# data.frame() turns the "2.5%" / "97.5%" column names into X2.5. / X97.5.,
# which is what the aesthetics below refer to.
df <- data.frame(
  rbind(
    python.week,
    python.weekend[rownames(python.weekend), ]
  )
)
df$medida <- c("PYTHON (Week)", "PYTHON (Weekend)")

# One error bar per group, spanning that group's confidence interval.
df %>%
  ggplot(aes(x = medida, ymin = X2.5., ymax = X97.5.)) +
  geom_errorbar(width = .2) +
  labs(y = "Popularity", x = "Group")
Looking at the confidence intervals (C.I.) of Python popularity during the week and during the weekend, we can say with 95% confidence that there is a statistically significant difference between Python popularity during the week and Python popularity during the weekend, because the two intervals do not overlap.
# Keep only the Python rows.
python <- filter(data, file_extension == "py")

# Two-sample bootstrap (10,000 resamples) of the difference in median users
# between the isWeekend == FALSE and isWeekend == TRUE groups; the printed
# label shows the direction as FALSE-TRUE, i.e. weekday minus weekend.
b.diff.means <- bootstrap2(
  python$users,
  median,
  treatment = python$isWeekend,
  R = 10000
)

# 95% percentile interval of that difference; printed, then plotted.
means.diff <- CI.percentile(b.diff.means, probs = c(.025, .975))
means.diff

# Error bar for the interval with a reference line at zero (no difference).
ggplot(
  data.frame(means.diff),
  mapping = aes(x = "Difference", ymin = X2.5., ymax = X97.5.)
) +
  geom_errorbar(width = .2) +
  geom_hline(yintercept = 0, colour = "darkorange") +
  labs(x = "")
## 2.5% 97.5%
## median: FALSE-TRUE 2063 2233.829
Looking at the confidence interval (C.I.) of the unpaired difference between Python popularity during the week and Python popularity during the weekend, we can say with 95% confidence that Python is significantly more popular during the week than during the weekend, since the whole interval lies above zero.
# Weekend-only comparison: distribution of daily committers for Python and
# Java, one box per language. The fill legend identifies the languages, so
# the x axis title, labels and ticks are removed.
data %>%
  filter(isWeekend, file_extension %in% c("py", "java")) %>%
  ggplot(
    mapping = aes(
      x = file_extension,
      y = users,
      fill = file_extension,
      group = file_extension
    )
  ) +
  geom_boxplot() +
  ggtitle("Python vs Java (Weekends)") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  labs(y = "Number of users editing files")
# Re-print the two weekend intervals computed earlier, each under a heading.
cat("Java on weekend days:\n")
java.weekend
cat("\n\nPython on weekend days:\n")
python.weekend
## Java on weekend days:
## 2.5% 97.5%
## median(users) 2058 2167
##
##
## Python on weekend days:
## 2.5% 97.5%
## median(users) 2523 2644
# Put the Java and Python weekend intervals in one two-row data frame.
# python.weekend is indexed by its OWN row names: the original used the row
# names of python.week, an object this weekend-only comparison has no reason
# to depend on (it only worked because the row labels happen to match).
# data.frame() turns the "2.5%" / "97.5%" column names into X2.5. / X97.5.,
# which is what the aesthetics below refer to.
df <- data.frame(
  rbind(
    java.weekend,
    python.weekend[rownames(python.weekend), ]
  )
)
df$medida <- c("JAVA (Weekend)", "PYTHON (Weekend)")

# One error bar per language, spanning that language's confidence interval.
df %>%
  ggplot(aes(x = medida, ymin = X2.5., ymax = X97.5.)) +
  geom_errorbar(width = .2) +
  labs(y = "Popularity", x = "Group")
Looking at the confidence intervals (C.I.) of Java and Python popularity during the weekend, we can say with 95% confidence that there is a statistically significant difference between Java and Python popularity during the weekend, because the two intervals do not overlap.
# Weekend rows for the two languages being compared.
weekend <- data %>%
  filter(isWeekend, file_extension %in% c("py", "java"))

# Two-sample bootstrap (10,000 resamples) of the difference in median users
# between the two file extensions; the printed label shows the direction as
# java-py.
b.diff.means <- bootstrap2(
  weekend$users,
  median,
  treatment = weekend$file_extension,
  R = 10000
)

# 95% percentile interval of that difference; printed, then plotted.
means.diff <- CI.percentile(b.diff.means, probs = c(.025, .975))
means.diff

# Error bar for the interval with a reference line at zero (no difference).
ggplot(
  data.frame(means.diff),
  mapping = aes(x = "Difference", ymin = X2.5., ymax = X97.5.)
) +
  geom_errorbar(width = .2) +
  geom_hline(yintercept = 0, colour = "darkorange") +
  labs(x = "")
## 2.5% 97.5%
## median: java-py -548.5 -382
Looking at the confidence interval (C.I.) of the unpaired difference between Java and Python popularity during the weekend (Java minus Python), we can say with 95% confidence that Python is significantly more popular during the weekend than Java, since the whole interval lies below zero.