library(reticulate)
import numpy as np
import pandas as pd
import os
import re
# Switch into the folder that holds the Kaggle 2019 survey export so the
# relative file name used below resolves.
os.chdir('C:/Users/DellPC/Desktop/Corner/Py_source_code/Project/kaggle-survey-2019')
# low_memory=False asks pandas to parse the file in one pass rather than in
# internal chunks, which avoids mixed-type inference on wide columns.
kaggle_multiple_choice = pd.read_csv(
    'multiple_choice_responses.csv',
    low_memory=False,
)
# Preview the first three rows of the raw export.
kaggle_multiple_choice.head(n=3)
## Time from Start to Finish (seconds) ... Q34_OTHER_TEXT
## 0 Duration (in seconds) ... Which of the following relational database pro...
## 1 510 ... -1
## 2 423 ... -1
##
## [3 rows x 246 columns]
# Row 0 of the raw export carries the question wording rather than an answer
# (see the preview above), so keep only the rows from position 1 onward.
kaggle = kaggle_multiple_choice.iloc[1:]
kaggle.head(n=5)
## Time from Start to Finish (seconds) Q1 ... Q34_Part_12 Q34_OTHER_TEXT
## 1 510 22-24 ... NaN -1
## 2 423 40-44 ... NaN -1
## 3 83 55-59 ... NaN -1
## 4 391 40-44 ... NaN -1
## 5 392 22-24 ... NaN -1
##
## [5 rows x 246 columns]
# Removing everyone that took less than 3 minutes or more than 600 minutes to answer the survey.
MIN_DURATION_SECONDS = 3 * 60
MAX_DURATION_SECONDS = 600 * 60
answers_before = kaggle.shape[0]
print(f'Initial dataset length is {answers_before} answer.')
## Initial dataset length is 19717 answer.
# The first column is 'Time from Start to Finish (seconds)'; cast it to int
# once and reuse it for both limits.
duration_seconds = kaggle.iloc[:, 0].astype(int)
# Compare the raw seconds against the limits. The previous version rounded the
# duration to whole minutes first (round(sec / 60) <= 3 and >= 600), which also
# removed respondents who took between 3 and roughly 3.5 minutes, and those
# just under 600 minutes, contradicting the message printed below.
# NOTE: the answer counts recorded in the outputs that follow were produced
# with the rounded thresholds and may differ after this change.
less_3_minutes = kaggle.index[duration_seconds < MIN_DURATION_SECONDS]
more_600_minutes = kaggle.index[duration_seconds > MAX_DURATION_SECONDS]
# Dropping those rows
kaggle = kaggle.drop(less_3_minutes, axis=0).drop(more_600_minutes, axis=0)
answers_after = kaggle.shape[0]
print(
    'After removing respondents that took less than 3 minutes or more than 600 minutes'
    f' to answer the survey we were left with {answers_after} answers.'
)
## After removing respondents that took less than 3 minutes or more than 600 minutes to answer the survey we were left with 16129 answers.
# Drop respondents who left the compensation question (Q10) unanswered.
answers_before = len(kaggle)
kaggle = kaggle.dropna(subset=['Q10'])
answers_after = len(kaggle)
print(f'After removing respondents who did not disclose compensation there were left {answers_after} answer')
## After removing respondents who did not disclose compensation there were left 11402 answer
# Now let's group some data: start with the number of respondents per job title (Q5).
kaggle['Q5'].value_counts()
## Data Scientist 3377
## Software Engineer 2164
## Other 1345
## Data Analyst 1284
## Research Scientist 1157
## Product/Project Manager 605
## Business Analyst 602
## Data Engineer 503
## Statistician 236
## DBA/Database Engineer 129
## Name: Q5, dtype: int64
# Grouping DBA + Data Engineering.
# The survey spells the title 'DBA/Database Engineer' (no space after the
# slash). The previous pattern 'DBA/ Database Engineer' never matched, so those
# respondents stayed in a separate category. The counts recorded in the outputs
# below were produced before this fix and still list that category.
kaggle['Q5'] = kaggle['Q5'].replace(
    ['DBA/Database Engineer', 'Data Engineer'],
    'Data Engineer/DBA',
)
kaggle['Q5'].value_counts()
## Data Scientist 3377
## Software Engineer 2164
## Other 1345
## Data Analyst 1284
## Research Scientist 1157
## Product/Project Manager 605
## Business Analyst 602
## Data Engineer/DBA 503
## Statistician 236
## DBA/Database Engineer 129
## Name: Q5, dtype: int64
# Merge 'Statistician' and 'Research Scientist' under one combined job title.
kaggle['Q5'] = kaggle['Q5'].replace(
    ['Statistician', 'Research Scientist'],
    'Statistician/ Research Scientist',
)
kaggle['Q5'].value_counts()
## Data Scientist 3377
## Software Engineer 2164
## Statistician/ Research Scientist 1393
## Other 1345
## Data Analyst 1284
## Product/Project Manager 605
## Business Analyst 602
## Data Engineer/DBA 503
## DBA/Database Engineer 129
## Name: Q5, dtype: int64
# Simplifying country names.
# The United States key was previously misspelled as 'United States of Amercia',
# so that replacement never matched; the outputs recorded below were produced
# before this fix and still show 'United States of America'.
kaggle['Q3'] = kaggle['Q3'].replace({
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'United States of America': 'United States',
})
# Keep only the columns we need and give them meaningful names.
# An exact rename mapping is used instead of columns.str.replace: the latter
# replaces substrings, so 'Q3' also rewrote columns such as 'Q34_Part_12'
# (to 'Country4_Part_12') and 'Q5' rewrote any 'Q5_...' column.
kaggle = (
    kaggle[['Q3', 'Q5']]
    .rename(columns={'Q3': 'Country', 'Q5': 'JobTitle'})
    # Helper column of ones so that group sums give respondent counts.
    .assign(Count=1)
)
kaggle.head(10)
## Country JobTitle Count
## 1 France Software Engineer 1
## 2 India Software Engineer 1
## 4 Australia Other 1
## 5 India Other 1
## 6 France Data Scientist 1
## 7 India Data Scientist 1
## 8 United States of America Data Scientist 1
## 10 Netherlands Other 1
## 12 Germany Statistician/ Research Scientist 1
## 13 Germany Data Scientist 1
My idea was to measure how many people work in each position (job title) and to compare those numbers across countries.
# Total number of respondents per job title.
plot_data = kaggle.groupby('JobTitle', as_index=False)['Count'].sum()
plot_data
## JobTitle Count
## 0 Business Analyst 602
## 1 DBA/Database Engineer 129
## 2 Data Analyst 1284
## 3 Data Engineer/DBA 503
## 4 Data Scientist 3377
## 5 Other 1345
## 6 Product/Project Manager 605
## 7 Software Engineer 2164
## 8 Statistician/ Research Scientist 1393
# Number of respondents for every (job title, country) combination.
plot_data = kaggle.groupby(
    ['JobTitle', 'Country'],
    as_index=False,
)['Count'].sum()
plot_data
## JobTitle Country Count
## 0 Business Analyst Algeria 1
## 1 Business Analyst Argentina 7
## 2 Business Analyst Australia 9
## 3 Business Analyst Austria 1
## 4 Business Analyst Bangladesh 4
## .. ... ... ...
## 486 Statistician/ Research Scientist Turkey 25
## 487 Statistician/ Research Scientist Ukraine 16
## 488 Statistician/ Research Scientist United Kingdom 43
## 489 Statistician/ Research Scientist United States of America 211
## 490 Statistician/ Research Scientist Viet Nam 10
##
## [491 rows x 3 columns]
## 'bar_chart.html'
# Embed the contents of 'bar_chart.html' in the rendered document
# (presumably the file written by the plotting step echoed just above).
htmltools::includeHTML("bar_chart.html")
## JobTitle Country Count
## 0 Business Analyst Algeria 1
## 1 Business Analyst Argentina 7
## 2 Business Analyst Australia 9
## 3 Business Analyst Austria 1
## 4 Business Analyst Bangladesh 4
## .. ... ... ...
## 486 Statistician/ Research Scientist Turkey 25
## 487 Statistician/ Research Scientist Ukraine 16
## 488 Statistician/ Research Scientist United Kingdom 43
## 489 Statistician/ Research Scientist United States of America 211
## 490 Statistician/ Research Scientist Viet Nam 10
##
## [491 rows x 3 columns]
## 'bar_stack_chart.html'
# Embed the remaining chart HTML files in the rendered document, in this order.
# NOTE(review): the output above echoes 'bar_stack_chart.html', but no
# includeHTML call for that file is visible here; confirm whether one is missing.
htmltools::includeHTML('line.html')
htmltools::includeHTML('line_color.html')
htmltools::includeHTML('polar.html')
htmltools::includeHTML('radar.html')
htmltools::includeHTML('polar_adjust.html')
htmltools::includeHTML('polar_adjust1.html')
htmltools::includeHTML('final.html')