This case study is taken from Supervised Machine Learning Case Studies in R by Julia Silge.
Print the stackoverflow object.
In the calls to count(), check out the distributions for remote status first, and then country.
library(tidyverse)
# Read data/stackoverflow.csv into the `stackoverflow` tibble used by the rest
# of this analysis.
stackoverflow <- read_csv("data/stackoverflow.csv")
Use the glimpse() function to view the data and print the stackoverflow object.
# Show every column with its type and first few values
stackoverflow %>%
  glimpse()
Rows: 6,991
Columns: 22
$ Respondent <dbl> 3, 15, 18, 19, 26, 55, 62, 71,…
$ Country <chr> "United Kingdom", "United King…
$ Salary <dbl> 113750.000, 100000.000, 130000…
$ YearsCodedJob <dbl> 20, 20, 20, 3, 16, 4, 1, 1, 20…
$ OpenSource <lgl> TRUE, FALSE, TRUE, FALSE, FALS…
$ Hobby <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, …
$ CompanySizeNumber <dbl> 10000, 5000, 1000, 10000, 1000…
$ Remote <chr> "Not remote", "Remote", "Remot…
$ CareerSatisfaction <dbl> 8, 8, 9, 5, 7, 9, 5, 8, 8, 10,…
$ `Data scientist` <lgl> FALSE, FALSE, FALSE, FALSE, FA…
$ `Database administrator` <lgl> FALSE, FALSE, FALSE, FALSE, FA…
$ `Desktop applications developer` <lgl> FALSE, FALSE, FALSE, FALSE, FA…
$ `Developer with stats/math background` <lgl> FALSE, FALSE, FALSE, FALSE, FA…
$ DevOps <lgl> FALSE, FALSE, TRUE, FALSE, FAL…
$ `Embedded developer` <lgl> FALSE, TRUE, TRUE, FALSE, FALS…
$ `Graphic designer` <lgl> FALSE, FALSE, FALSE, FALSE, FA…
$ `Graphics programming` <lgl> FALSE, FALSE, FALSE, FALSE, FA…
$ `Machine learning specialist` <lgl> FALSE, FALSE, FALSE, FALSE, FA…
$ `Mobile developer` <lgl> FALSE, FALSE, FALSE, FALSE, FA…
$ `Quality assurance engineer` <lgl> FALSE, FALSE, TRUE, FALSE, FAL…
$ `Systems administrator` <lgl> FALSE, FALSE, FALSE, FALSE, FA…
$ `Web developer` <lgl> FALSE, FALSE, TRUE, TRUE, TRUE…
# Distribution of remote status, most frequent value first
stackoverflow %>%
  count(Remote, sort = TRUE)
# A tibble: 2 x 2
Remote n
<chr> <int>
1 Not remote 6273
2 Remote 718
# Distribution of respondents by country, most frequent value first
stackoverflow %>%
  count(Country, sort = TRUE)
# A tibble: 5 x 2
Country n
<chr> <int>
1 United States 3486
2 United Kingdom 1270
3 Germany 950
4 India 666
5 Canada 619
Use the appropriate column from the data set so you can plot a boxplot with remote status on the x-axis and professional experience on the y-axis.
# Boxplot of years of professional coding experience for each remote status.
# The x-axis title is dropped because the category labels are self-explanatory.
stackoverflow %>%
  ggplot(mapping = aes(x = Remote, y = YearsCodedJob)) +
  geom_boxplot() +
  labs(
    x = NULL,
    y = "Years of professional coding experience"
  )
Remove the Respondent column.
Convert the Remote variable to a 0-1 variable using mutate().
# Drop the Respondent column and recode Remote as numeric:
# 1 when the value is "Remote", 0 otherwise (NA stays NA).
stackoverflow <- stackoverflow %>%
  select(-Respondent) %>%
  mutate(Remote = as.numeric(Remote == "Remote"))
# Split the data into training (80%) and testing (20%) sets
library(rsample)
set.seed(1234) # fixed seed so the random split is reproducible
stack_split <- stackoverflow %>%
  initial_split(prop = 0.8)
stack_train <- stack_split %>% training()
stack_test <- stack_split %>% testing()
# Build a simple logistic regression model
# `Remote ~ .` regresses the 0/1 Remote column on every other column of
# stack_train; family = "binomial" makes this a logistic regression.
simple_glm <- glm(Remote~.,
family = "binomial",
data =stack_train)
# Print the coefficient table, deviances and AIC for the fitted model
summary(simple_glm)
Call:
glm(formula = Remote ~ ., family = "binomial", data = stack_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.1993 -0.4991 -0.3845 -0.2902 2.9282
Coefficients:
Estimate Std. Error z value
(Intercept) -3.929e+00 3.246e-01 -12.103
CountryGermany -3.083e-01 2.402e-01 -1.283
CountryIndia 7.780e-01 2.466e-01 3.155
CountryUnited Kingdom -4.925e-02 2.157e-01 -0.228
CountryUnited States 4.564e-01 1.960e-01 2.329
Salary 4.632e-06 1.775e-06 2.610
YearsCodedJob 6.587e-02 8.476e-03 7.771
OpenSourceTRUE 4.647e-01 9.532e-02 4.875
HobbyTRUE 3.251e-02 1.110e-01 0.293
CompanySizeNumber -6.976e-05 1.391e-05 -5.016
CareerSatisfaction 5.524e-02 2.976e-02 1.856
`Data scientist`TRUE -1.600e-01 2.085e-01 -0.768
`Database administrator`TRUE 3.070e-01 1.426e-01 2.152
`Desktop applications developer`TRUE -3.062e-01 1.108e-01 -2.763
`Developer with stats/math background`TRUE 9.944e-02 1.497e-01 0.664
DevOpsTRUE -1.731e-01 1.451e-01 -1.193
`Embedded developer`TRUE -2.264e-01 1.811e-01 -1.250
`Graphic designer`TRUE -1.936e-01 3.088e-01 -0.627
`Graphics programming`TRUE 1.316e-01 2.589e-01 0.508
`Machine learning specialist`TRUE -2.142e-01 3.128e-01 -0.685
`Mobile developer`TRUE 1.904e-01 1.144e-01 1.664
`Quality assurance engineer`TRUE -1.741e-01 2.588e-01 -0.673
`Systems administrator`TRUE 8.991e-02 1.600e-01 0.562
`Web developer`TRUE 9.265e-02 1.108e-01 0.836
Pr(>|z|)
(Intercept) < 2e-16 ***
CountryGermany 0.19936
CountryIndia 0.00161 **
CountryUnited Kingdom 0.81941
CountryUnited States 0.01988 *
Salary 0.00906 **
YearsCodedJob 7.77e-15 ***
OpenSourceTRUE 1.09e-06 ***
HobbyTRUE 0.76962
CompanySizeNumber 5.27e-07 ***
CareerSatisfaction 0.06347 .
`Data scientist`TRUE 0.44276
`Database administrator`TRUE 0.03137 *
`Desktop applications developer`TRUE 0.00573 **
`Developer with stats/math background`TRUE 0.50661
DevOpsTRUE 0.23299
`Embedded developer`TRUE 0.21136
`Graphic designer`TRUE 0.53066
`Graphics programming`TRUE 0.61121
`Machine learning specialist`TRUE 0.49356
`Mobile developer`TRUE 0.09605 .
`Quality assurance engineer`TRUE 0.50107
`Systems administrator`TRUE 0.57411
`Web developer`TRUE 0.40300
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 3696.2 on 5592 degrees of freedom
Residual deviance: 3421.3 on 5569 degrees of freedom
AIC: 3469.3
Number of Fisher Scoring iterations: 5
# Store the model's fitted probability (type = "response") for every training
# row in a new simple_glm column of stack_train.
stack_train$simple_glm <- predict(
  simple_glm,
  newdata = stack_train,
  type = "response"
)

# Histogram of the fitted probabilities, one panel per observed Remote value.
# bins = 30 is the value stat_bin() falls back to when nothing is given;
# stating it keeps the plot unchanged and avoids the
# "`stat_bin()` using `bins = 30`" message on every render.
# scales = "free_y" lets each panel use its own count axis.
ggplot(stack_train, aes(x = simple_glm)) +
  geom_histogram(bins = 30, fill = "skyblue", colour = "black") +
  facet_wrap(~Remote, ncol = 1, scales = "free_y")
A density plot
# Density of the simple_glm fitted probabilities, one panel per observed
# Remote value, each with its own y-axis scale
stack_train %>%
  ggplot(mapping = aes(x = simple_glm)) +
  geom_density() +
  facet_wrap(~Remote, ncol = 1, scales = "free_y")
# Re-extract the training set from the split. NOTE(review): presumably done so
# that the simple_glm prediction column added to stack_train is not picked up
# as a predictor by `Remote ~ .` -- confirm.
stack_train2 <- stack_split %>% training()

library(rpart)
# Remote is numeric (0/1) here, so rpart fits a regression ("anova") tree:
# each node's yval is the mean of Remote among the rows in that node.
simple_tree <- rpart(formula = Remote ~ ., data = stack_train2)
print(simple_tree)
n= 5593
node), split, n, deviance, yval
* denotes terminal node
1) root 5593 514.29640 0.10244950
2) Salary< 88944 3824 250.93620 0.07060669
4) CompanySizeNumber>=5.5 3496 194.74340 0.05921053 *
5) CompanySizeNumber< 5.5 328 50.89939 0.19207320 *
3) Salary>=88944 1769 251.10120 0.17128320
6) CompanySizeNumber>=300 940 97.12766 0.11702130 *
7) CompanySizeNumber< 300 829 148.06760 0.23281060 *
# Draw the fitted simple_tree using the rpart.plot package
library(rpart.plot)
rpart.plot(simple_tree)
# Apply the model to the test set: store the fitted probability
# (type = "response") for every test row in a new simple_glm column.
stack_test$simple_glm <- predict(
  simple_glm,
  newdata = stack_test,
  type = "response"
)

# Boxplot of the predicted probabilities grouped by the observed Remote value
stack_test %>%
  ggplot(mapping = aes(x = factor(Remote), y = simple_glm)) +
  geom_boxplot()