import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset from the CSV file and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="downloads/data.csv")
df.head(5)
| Jobclass | Job | age | Education-level | marital-status | race | gender | hours-per-week | native-country | income | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Private | Machine-op-inspct | 25 | 11th | Never-married | Black | Male | 40 | United-States | <=50K |
| 1 | Private | Farming-fishing | 38 | HS-grad | Married-A-spouse | White | Male | 50 | United-States | <=50K |
| 2 | Local-gov | Protective-serv | 28 | Assoc-acdm | Married-A-spouse | White | Male | 40 | United-States | >50K |
| 3 | Private | Machine-op-inspct | 44 | Some-college | Married-A-spouse | Black | Male | 40 | United-States | >50K |
| 4 | ? | ? | 18 | Some-college | Never-married | White | Female | 30 | United-States | <=50K |
# Dimensions of the frame as a (rows, columns) tuple.
df.shape
(48842, 10)
The dataset contains 48,842 rows and 10 columns.
# Count, per column, how many cells hold the "?" placeholder.
(df == "?").sum()
Jobclass 2799 Job 2809 age 0 Education-level 0 marital-status 0 race 0 gender 0 hours-per-week 0 native-country 857 income 0 dtype: int64
# Same per-column "?" count, limited to the first two columns.
(df == "?").sum().iloc[:2]
Jobclass 2799 Job 2809 dtype: int64
# Preview only: replace() returns a new frame and the result is not assigned,
# so df itself still contains the "?" placeholders after this line.
df.replace("?",np.nan).head()
| Jobclass | Job | age | Education-level | marital-status | race | gender | hours-per-week | native-country | income | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Private | Machine-op-inspct | 25 | 11th | Never-married | Black | Male | 40 | United-States | <=50K |
| 1 | Private | Farming-fishing | 38 | HS-grad | Married-A-spouse | White | Male | 50 | United-States | <=50K |
| 2 | Local-gov | Protective-serv | 28 | Assoc-acdm | Married-A-spouse | White | Male | 40 | United-States | >50K |
| 3 | Private | Machine-op-inspct | 44 | Some-college | Married-A-spouse | Black | Male | 40 | United-States | >50K |
| 4 | NaN | NaN | 18 | Some-college | Never-married | White | Female | 30 | United-States | <=50K |
# Distinct values of the "Jobclass" column, in order of first appearance.
df["Jobclass"].unique().tolist()
['Private', 'Local-gov', '?', 'Self-emp-not-inc', 'Federal-gov', 'State-gov', 'Self-emp-inc', 'Without-pay', 'Never-worked']
# Distinct values of the "Job" column, in order of first appearance.
df["Job"].unique().tolist()
['Machine-op-inspct', 'Farming-fishing', 'Protective-serv', '?', 'Other-service', 'Prof-specialty', 'Craft-repair', 'Adm-clerical', 'Exec-managerial', 'Tech-support', 'Sales', 'Priv-house-serv', 'Transport-moving', 'Handlers-cleaners', 'Armed-Forces']
# Distinct values of the "native-country" column, in order of first appearance.
df["native-country"].unique().tolist()
['United-States', '?', 'Peru', 'Guatemala', 'Mexico', 'Dominican-Republic', 'Ireland', 'Germany', 'Philippines', 'Thailand', 'Haiti', 'El-Salvador', 'Puerto-Rico', 'Vietnam', 'South', 'Columbia', 'Japan', 'India', 'Cambodia', 'Poland', 'Laos', 'England', 'Cuba', 'Taiwan', 'Italy', 'Canada', 'Portugal', 'China', 'Nicaragua', 'Honduras', 'Iran', 'Scotland', 'Jamaica', 'Ecuador', 'Yugoslavia', 'Hungary', 'Hong', 'Greece', 'Trinadad&Tobago', 'Outlying-US(Guam-USVI-etc)', 'France', 'Holand-Netherlands']
# Impute missing categorical values with each column's most frequent value.
# Missing entries are stored as the literal string "?", so they are converted
# to NaN first; otherwise fillna() has nothing to fill.
# Series.mode() returns a Series (ties are possible), so its first element is
# used: passing the Series itself makes fillna() align on index labels instead
# of broadcasting a single value.
# Results are assigned back instead of using inplace=True on a column
# selection, which is not guaranteed to modify df.
for column in ("native-country", "Job", "Jobclass"):
    df[column] = df[column].replace("?", np.nan)
    df[column] = df[column].fillna(df[column].mode().iloc[0])
# Number of NaN cells remaining in each column.
df.isnull().sum()
Jobclass 0 Job 0 age 0 Education-level 0 marital-status 0 race 0 gender 0 hours-per-week 0 native-country 0 income 0 dtype: int64
# Encode the income bracket as a binary value: 0 for "<=50K", 1 for ">50K".
income_codes = {"<=50K": 0, ">50K": 1}
df["income"] = df["income"].replace(income_codes)
df.head()
| Jobclass | Job | age | Education-level | marital-status | race | gender | hours-per-week | native-country | income | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Private | Machine-op-inspct | 25 | 11th | Never-married | Black | Male | 40 | United-States | 0 |
| 1 | Private | Farming-fishing | 38 | HS-grad | Married-A-spouse | White | Male | 50 | United-States | 0 |
| 2 | Local-gov | Protective-serv | 28 | Assoc-acdm | Married-A-spouse | White | Male | 40 | United-States | 1 |
| 3 | Private | Machine-op-inspct | 44 | Some-college | Married-A-spouse | Black | Male | 40 | United-States | 1 |
| 4 | ? | ? | 18 | Some-college | Never-married | White | Female | 30 | United-States | 0 |
# Inspect the dtype of every column.
df.dtypes
Jobclass object Job object age int64 Education-level object marital-status object race object gender object hours-per-week int64 native-country object income int32 dtype: object
# astype() returns a new Series rather than converting in place, so the result
# is assigned back to the column for the cast to take effect.
df["income"] = df["income"].astype("int32")
# Confirm the resulting column dtypes.
df.dtypes
Jobclass object Job object age int64 Education-level object marital-status object race object gender object hours-per-week int64 native-country object income int32 dtype: object
# Correlation heatmap of the numeric columns.
# The figure size is configured before plotting so that it applies to the
# figure created by this plot (setting it afterwards only affects later ones).
sns.set(rc={"figure.figsize": (10, 7)})
# Only numeric columns are correlated; the string-valued columns cannot be.
# The correlation matrix is the sole data argument: also passing data=df
# supplies the same parameter twice.
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True)
plt.show()
This heatmap shows that there is a relationship between "age" and "income", but little else stands out.
# Box plot of the age distribution for each income class.
sns.boxplot(data=df, x="income", y="age", hue="income")
plt.show()
# Horizontal count plot of education level, split by income class.
sns.countplot(data=df, y="Education-level", hue="income")
<AxesSubplot:xlabel='count', ylabel='Education-level'>
This plot describes income across education levels: among HS-grad, most people earn less than 50K, whereas among Bachelors most people earn more than 50K.
# Horizontal count plot of job type, split by income class.
sns.countplot(data=df, y="Job", hue="income")
plt.show()
This plot shows that the higher-income jobs are "Prof-specialty" and "Exec-managerial", while the lower-income jobs are "Adm-clerical" and "Other-service".