mydata <- read.csv("kaggle_survey_2020_responses.csv")
#Let's remove the first row of the data which contains unnecessary headings for analyzing the data.
my_newdata<-mydata[-1, ]
#Let's take the variable Q1 which is about the age group of survey takers. We want to see how many people of which age group responded to the survey.
suppressMessages(library(sqldf))
newData1<- sqldf("SELECT Q1, COUNT(Q1) AS Responses FROM my_newdata GROUP BY Q1")
newData1
Q1 Responses
1 18-21 3469
2 22-24 3786
3 25-29 4011
4 30-34 2811
5 35-39 1991
6 40-44 1397
7 45-49 988
8 50-54 698
9 55-59 411
10 60-69 398
11 70+ 76
#Now let's create a bar plot for the variable Q1. We can see that the number of survey takers for the age group 25-29 is the highest and lowest for 70+ age group.
ggplot(data=newData1, aes(x=Q1, y=Responses, fill=Q1)) + geom_bar(stat="identity") +
labs(x="Age Group", y="Number of survey takers")
#Now let's look the variable Q2 which is gender. We want to see how many people took the survey based on gender.
suppressMessages(library(sqldf))
newData2<- sqldf("SELECT Q2, COUNT(Q2) AS Number FROM my_newdata GROUP BY Q2")
newData2
Q2 Number
1 Man 15789
2 Nonbinary 52
3 Prefer not to say 263
4 Prefer to self-describe 54
5 Woman 3878
#Now let's create a bar plot for the variable Q2. We can see that we have a high number of male survey takers compared to females whereas other groups of people are less regarding survey responding.
ggplot(data=newData2, aes(x=Q2, y=Number, fill=Q2)) + geom_bar(stat="identity") +
labs(x="Gender", y="Number of survey takers")+
theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1)) + labs(fill="Gender")
#Now let's look at the third variable Q3 which is country. We want to see the top five countries which have the highest survey takers. We can ignore `other` which means other countries.
newData3<- sqldf("SELECT Q3, COUNT(Q3) AS Number FROM my_newdata GROUP BY Q3 ORDER BY Number desc")
#We can see from the above data that India, USA, Brazil, Japan and Russia have the highest number of survey takers. So let's look at these countries.
newData31<- sqldf("SELECT*FROM newData3 WHERE Q3 in ('India', 'United States of America', 'Brazil', 'Japan', 'Russia') ")
newData31
Q3 Number
1 India 5851
2 United States of America 2237
3 Brazil 694
4 Japan 638
5 Russia 582
#India has the highest number of survey takers
ggplot(data=newData31, aes(x=Q3, y=Number, fill=Q3)) + geom_bar(stat="identity") +
labs(x="Countries", y="Number of survey takers") + labs(fill="Top Five Countries")
# Now let's look at Q4 which is about degrees.
newData4<- sqldf("SELECT Q4, COUNT(Q4) AS Number FROM my_newdata GROUP BY Q4 ")
newData4
Q4 Number
1 467
2 Bachelor’s degree 6978
3 Doctoral degree 2302
4 I prefer not to answer 399
5 Master’s degree 7859
6 No formal education past high school 240
7 Professional degree 699
8 Some college/university study without earning a bachelor’s degree 1092
#We can see that number of survey takers that have masters degree is the highest followed by bachelors degree.
ggplot(data=newData4, aes(x=Q4, y=Number, fill=Q4)) + geom_bar(stat="identity") +
labs(x="Degrees", y="Number of survey takers")+ theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + labs(fill="Degree")
#Now let's look at Q5 which is about employment status.
newData5<- sqldf("SELECT Q5, COUNT(Q5) AS Number FROM my_newdata GROUP BY Q5 ")
newData5
Q5 Number
1 759
2 Business Analyst 798
3 Currently not employed 1652
4 DBA/Database Engineer 125
5 Data Analyst 1475
6 Data Engineer 437
7 Data Scientist 2676
8 Machine Learning Engineer 1082
9 Other 1737
10 Product/Project Manager 692
11 Research Scientist 1174
12 Software Engineer 1968
13 Statistician 290
14 Student 5171
#The number of student survey takers is the highest followed by data scientist.
ggplot(data=newData5, aes(x=Q5, y=Number, fill=Q5)) + geom_bar(stat="identity") +
labs(x="Positions", y="Number of survey takers") + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())+ labs(fill="Employment Status")
#Now let's look at Q6 which is about number of years of experience.
newData6<- sqldf("SELECT Q6, COUNT(Q6) AS Number FROM my_newdata GROUP BY Q6 ")
newData6
Q6 Number
1 916
2 1-2 years 4505
3 10-20 years 1751
4 20+ years 1329
5 3-5 years 4546
6 5-10 years 2552
7 < 1 years 3313
8 I have never written code 1124
#We can see that survey takers of 3-5 years of experience and 1-2 years of experience are more in number.
ggplot(data=newData6, aes(x=Q6, y=Number, fill=Q6)) + geom_bar(stat="identity") +
labs(x="Years of Experience", y="Number of survey takers") + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +labs(fill="Years of Writing Code")
#Now let's look at variable Q7_Part_1 which is about using Python for coding.
newData7<- sqldf("SELECT Q7_Part_1, COUNT(Q7_Part_1) AS Number FROM my_newdata GROUP BY Q7_Part_1 ")
newData7
Q7_Part_1 Number
1 4506
2 Python 15530
#We can see that 15530 number of survey takers responded that they used python and 4506 used other programming languages.
ggplot(data=newData7, aes(x=Q7_Part_1, y=Number, fill=Q7_Part_1)) + geom_bar(stat="identity") +
labs(x="Python Coding Experience", y="Number of survey takers") +labs(fill="Programming Language")
#Now let's look at variable Q7_Part_2 which is about using R for coding.
newData8<- sqldf("SELECT Q7_Part_2, COUNT(Q7_Part_2) AS Number FROM my_newdata GROUP BY Q7_Part_2 ")
newData8
Q7_Part_2 Number
1 15759
2 R 4277
#We can see that 4277 number of survey takers responded that they used python and 15759 used other programming languages.
ggplot(data=newData8, aes(x=Q7_Part_2, y=Number, fill=Q7_Part_2)) + geom_bar(stat="identity") +
labs(x="R Coding Experience", y="Number of survey takers") +labs(fill=" Programming Language")
#Now let's look at variable Q7_Part_3 which is about using SQL for coding.
newData9<- sqldf("SELECT Q7_Part_3, COUNT(Q7_Part_3) AS Number FROM my_newdata GROUP BY Q7_Part_3 ")
newData9
Q7_Part_3 Number
1 12501
2 SQL 7535
#We can see that 7535 number of survey takers responded that they used SQL and 12501 used other programming languages.
ggplot(data=newData9, aes(x=Q7_Part_3, y=Number, fill=Q7_Part_3)) + geom_bar(stat="identity") +
labs(x="SQL Coding Experience", y="Number of survey takers") +labs(fill=" Programming Language")
#Now let's look at variable Q7_Part_4 which is about using SQL for coding.
newData10<- sqldf("SELECT Q7_Part_4, COUNT(Q7_Part_4) AS Number FROM my_newdata GROUP BY Q7_Part_4 ")
newData10
Q7_Part_4 Number
1 16721
2 C 3315
#We can see that 3315 number of survey takers responded that they used C and 16721 used other programming languages.
ggplot(data=newData10, aes(x=Q7_Part_4, y=Number, fill=Q7_Part_4)) + geom_bar(stat="identity") +
labs(x="C Coding Experience", y="Number of survey takers") +labs(fill=" Programming Language")
#Now I am interested to know which programming language would the survey takers recommend to the aspiring data scientist. Therefore, let's look at variable Q8.
newData11<- sqldf("SELECT Q8, COUNT(Q8) AS Number FROM my_newdata GROUP BY Q8 ")
#We can see that survey takers recommend Python the most followed by R and SQL programming languages. We can see that 2215 survey takers didn't respond to this question.
ggplot(data=newData11, aes(x=Q8, y=Number, fill=Q8)) + geom_bar(stat="identity") +
labs(x="Programming Language", y="Number of survey takers") +labs(fill=" Programming Language") + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
Findings from our study:
The number of survey takers for the age group 25-29 is the highest and lowest for 70+ age group.
There is a high number of male survey takers compared to females.
India, USA, Brazil, Japan and Russia have the highest number of survey takers.
The number of survey takers that have masters degree is the highest followed by bachelors degree.
The number of student survey takers is the highest followed by data scientist.
Survey takers of 3-5 years of experience and 1-2 years of experience are more in number.
Survey takers mostly used Python for coding followed by R , then C and other programming languages.
Survey takers recommend Python the most followed by R and SQL programming languages