library(reticulate)

Loading Data and preparing it for plotting


import numpy as np
import pandas as pd 
import os 
import re

# Work from the folder that holds the Kaggle 2019 survey files so the
# relative file name below resolves.
os.chdir('C:/Users/DellPC/Desktop/Corner/Py_source_code/Project/kaggle-survey-2019')

# low_memory=False parses the file in one pass instead of in chunks,
# which avoids mixed-type inference within a column.
kaggle_multiple_choice = pd.read_csv(
    'multiple_choice_responses.csv',
    low_memory=False,
)

# Preview the first three rows
kaggle_multiple_choice.head(n=3)

##   Time from Start to Finish (seconds)  ...                                     Q34_OTHER_TEXT
## 0               Duration (in seconds)  ...  Which of the following relational database pro...
## 1                                 510  ...                                                 -1
## 2                                 423  ...                                                 -1
## 
## [3 rows x 246 columns]

# Row 0 carries the question text rather than a response (see the preview
# output above), so keep everything from the second row onward.
kaggle = kaggle_multiple_choice.iloc[1:]

kaggle.head()

##   Time from Start to Finish (seconds)     Q1  ... Q34_Part_12 Q34_OTHER_TEXT
## 1                                 510  22-24  ...         NaN             -1
## 2                                 423  40-44  ...         NaN             -1
## 3                                  83  55-59  ...         NaN             -1
## 4                                 391  40-44  ...         NaN             -1
## 5                                 392  22-24  ...         NaN             -1
## 
## [5 rows x 246 columns]


# Discard respondents with an implausible completion time. Column 0 holds the
# duration in seconds; it is converted to minutes and rounded, and every row
# whose rounded value is <= 3 or >= 600 is dropped.
answers_before = len(kaggle.index)

print(f'Initial dataset length is {answers_before} answer.')

## Initial dataset length is 19717 answer.

# Index labels of the respondents at or below the 3-minute mark
less_3_minutes = kaggle.loc[round(kaggle.iloc[:, 0].astype(int) / 60) <= 3].index
kaggle = kaggle.drop(index=less_3_minutes)

# Index labels of the respondents at or above the 600-minute mark; the
# durations are recomputed because the frame has just been shortened.
more_600_minutes = kaggle.loc[round(kaggle.iloc[:, 0].astype(int) / 60) >= 600].index
kaggle = kaggle.drop(index=more_600_minutes)

answers_after = len(kaggle.index)

print(
    'After removing respondents that took less than 3 minutes or more than 600 minutes'
    f' to answer the survey we were left with {answers_after} answers.'
)

## After removing respondents that took less than 3 minutes or more than 600 minutes to answer the survey we were left with 16129 answers.


# Respondents with no answer to Q10 (compensation) are excluded

answers_before = len(kaggle.index)
kaggle = kaggle.dropna(subset=['Q10'])

answers_after = len(kaggle.index)
print(f'After removing respondents who did not disclose compensation there were left {answers_after} answer')

## After removing respondents who did not disclose compensation there were left 11402 answer


# Now let's group some data, starting from the raw job-title (Q5) counts

kaggle['Q5'].value_counts()

## Data Scientist             3377
## Software Engineer          2164
## Other                      1345
## Data Analyst               1284
## Research Scientist         1157
## Product/Project Manager     605
## Business Analyst            602
## Data Engineer               503
## Statistician                236
## DBA/Database Engineer       129
## Name: Q5, dtype: int64


# Grouping DBA + Data Engineering under the single label 'Data Engineer/DBA'.
# The survey spells the DBA title 'DBA/Database Engineer' (no space after the
# slash). The previous key 'DBA/ Database Engineer' matched nothing, so those
# respondents were never merged.
# NOTE: the `##` outputs recorded further down in this document were captured
# before this fix and still list 'DBA/Database Engineer' as its own category;
# re-run the document to refresh them.

kaggle['Q5'] = kaggle['Q5'].replace(
    {
        'DBA/Database Engineer': 'Data Engineer/DBA',
        'Data Engineer': 'Data Engineer/DBA',
    }
)
kaggle['Q5'].value_counts()

## Data Scientist             3377
## Software Engineer          2164
## Other                      1345
## Data Analyst               1284
## Research Scientist         1157
## Product/Project Manager     605
## Business Analyst            602
## Data Engineer/DBA           503
## Statistician                236
## DBA/Database Engineer       129
## Name: Q5, dtype: int64


# Grouping Statistician + Research Scientist into one combined job title

kaggle['Q5'] = kaggle['Q5'].replace(
    {
        'Statistician': 'Statistician/ Research Scientist',
        'Research Scientist': 'Statistician/ Research Scientist',
    }
)

kaggle['Q5'].value_counts()

## Data Scientist                      3377
## Software Engineer                   2164
## Statistician/ Research Scientist    1393
## Other                               1345
## Data Analyst                        1284
## Product/Project Manager              605
## Business Analyst                     602
## Data Engineer/DBA                    503
## DBA/Database Engineer                129
## Name: Q5, dtype: int64


# Simplifying country names (Q3) by shortening the two long official names.
# The previous key 'United States of Amercia' was misspelled and matched
# nothing, so 'United States of America' was never shortened.
# NOTE: the `##` outputs recorded further down in this document were captured
# before this fix and still show 'United States of America'; re-run the
# document to refresh them.

kaggle['Q3'] = kaggle['Q3'].replace(
    {
        'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
        'United States of America': 'United States',
    }
)


# Now let's rename some columns to have more meaningful names.
# rename() matches whole column labels only. The earlier
# columns.str.replace('Q3', ...) was a substring replacement, so it also
# rewrote every other label containing 'Q3' (for example 'Q34_Part_12').

kaggle = kaggle.rename(columns={'Q5': 'JobTitle', 'Q3': 'Country'})

# Constant column so that summing it within a group yields a respondent count
kaggle['Count'] = 1


# Narrow the frame down to the only columns needed from here on

kaggle = kaggle.loc[:, ['Country', 'JobTitle', 'Count']]

kaggle.head(n=10)

##                      Country                          JobTitle  Count
## 1                     France                 Software Engineer      1
## 2                      India                 Software Engineer      1
## 4                  Australia                             Other      1
## 5                      India                             Other      1
## 6                     France                    Data Scientist      1
## 7                      India                    Data Scientist      1
## 8   United States of America                    Data Scientist      1
## 10               Netherlands                             Other      1
## 12                   Germany  Statistician/ Research Scientist      1
## 13                   Germany                    Data Scientist      1

Data to Plot

My idea was to measure how many people work in each position (job title) and to compare those counts across countries.


# Total number of respondents per job title
plot_data = kaggle.groupby('JobTitle', as_index=False)['Count'].sum()

plot_data

##                            JobTitle  Count
## 0                  Business Analyst    602
## 1             DBA/Database Engineer    129
## 2                      Data Analyst   1284
## 3                 Data Engineer/DBA    503
## 4                    Data Scientist   3377
## 5                             Other   1345
## 6           Product/Project Manager    605
## 7                 Software Engineer   2164
## 8  Statistician/ Research Scientist   1393


# Total number of respondents for every (job title, country) pair;
# this replaces the per-job-title table held in plot_data.

plot_data = kaggle.groupby(
    ['JobTitle', 'Country'],
    as_index=False,
)['Count'].sum()

plot_data

##                              JobTitle                   Country  Count
## 0                    Business Analyst                   Algeria      1
## 1                    Business Analyst                 Argentina      7
## 2                    Business Analyst                 Australia      9
## 3                    Business Analyst                   Austria      1
## 4                    Business Analyst                Bangladesh      4
## ..                                ...                       ...    ...
## 486  Statistician/ Research Scientist                    Turkey     25
## 487  Statistician/ Research Scientist                   Ukraine     16
## 488  Statistician/ Research Scientist            United Kingdom     43
## 489  Statistician/ Research Scientist  United States of America    211
## 490  Statistician/ Research Scientist                  Viet Nam     10
## 
## [491 rows x 3 columns]

First Plot

## 'bar_chart.html'

# Embed the contents of bar_chart.html in the knitted document
htmltools::includeHTML("bar_chart.html")

##                              JobTitle                   Country  Count
## 0                    Business Analyst                   Algeria      1
## 1                    Business Analyst                 Argentina      7
## 2                    Business Analyst                 Australia      9
## 3                    Business Analyst                   Austria      1
## 4                    Business Analyst                Bangladesh      4
## ..                                ...                       ...    ...
## 486  Statistician/ Research Scientist                    Turkey     25
## 487  Statistician/ Research Scientist                   Ukraine     16
## 488  Statistician/ Research Scientist            United Kingdom     43
## 489  Statistician/ Research Scientist  United States of America    211
## 490  Statistician/ Research Scientist                  Viet Nam     10
## 
## [491 rows x 3 columns]

## 'bar_stack_chart.html'

# Embed each saved HTML file, in this order, in the knitted document
htmltools::includeHTML('line.html')

htmltools::includeHTML('line_color.html')

htmltools::includeHTML('polar.html')

htmltools::includeHTML('radar.html')

htmltools::includeHTML('polar_adjust.html')

htmltools::includeHTML('polar_adjust1.html')

htmltools::includeHTML('final.html')

Create Wind

Nguyen_LSCM

8/21/2020

Loading Data and preparing it for plotting

Data to Plot

First Plot