import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns


# Read the CSV into a DataFrame, then preview its first rows and report its
# dimensions.
DATA_PATH = "downloads/data.csv"
df = pd.read_csv(DATA_PATH)
df.head()


df.shape

(48842, 10)


# Per-column tally of cells equal to "?", the marker this dataset uses for
# missing values.
(df == "?").sum()

Jobclass           2799
Job                2809
age                   0
Education-level       0
marital-status        0
race                  0
gender                0
hours-per-week        0
native-country      857
income                0
dtype: int64


# Number of records in which Job and Jobclass are *both* marked missing ("?").
# A single row-wise mask answers the per-record question; separate per-column
# totals cannot, because the two columns need not be missing on the same rows.
both_missing = (df["Job"] == "?") & (df["Jobclass"] == "?")
int(both_missing.sum())

Jobclass    2799
Job         2809
dtype: int64


# Turn every "?" marker into NaN so pandas' missing-data tools recognise it.
# DataFrame.replace returns a new frame instead of modifying df, so the result
# must be bound back to df for the change to persist.
df = df.replace("?", np.nan)
df.head()


# Distinct values found in the Jobclass column, as a plain Python list.
df["Jobclass"].unique().tolist()

['Private',
 'Local-gov',
 '?',
 'Self-emp-not-inc',
 'Federal-gov',
 'State-gov',
 'Self-emp-inc',
 'Without-pay',
 'Never-worked']


# Distinct values found in the Job column, as a plain Python list.
df["Job"].unique().tolist()

['Machine-op-inspct',
 'Farming-fishing',
 'Protective-serv',
 '?',
 'Other-service',
 'Prof-specialty',
 'Craft-repair',
 'Adm-clerical',
 'Exec-managerial',
 'Tech-support',
 'Sales',
 'Priv-house-serv',
 'Transport-moving',
 'Handlers-cleaners',
 'Armed-Forces']


# Distinct values found in the native-country column, as a plain Python list.
df["native-country"].unique().tolist()

['United-States',
 '?',
 'Peru',
 'Guatemala',
 'Mexico',
 'Dominican-Republic',
 'Ireland',
 'Germany',
 'Philippines',
 'Thailand',
 'Haiti',
 'El-Salvador',
 'Puerto-Rico',
 'Vietnam',
 'South',
 'Columbia',
 'Japan',
 'India',
 'Cambodia',
 'Poland',
 'Laos',
 'England',
 'Cuba',
 'Taiwan',
 'Italy',
 'Canada',
 'Portugal',
 'China',
 'Nicaragua',
 'Honduras',
 'Iran',
 'Scotland',
 'Jamaica',
 'Ecuador',
 'Yugoslavia',
 'Hungary',
 'Hong',
 'Greece',
 'Trinadad&Tobago',
 'Outlying-US(Guam-USVI-etc)',
 'France',
 'Holand-Netherlands']


# Fill missing entries in the three categorical columns with each column's
# most frequent value.
#  - Series.mode() returns a Series (there can be ties); handing that Series
#    to fillna aligns it on the index, so only rows whose label matches would
#    be filled. Indexing with [0] takes the first mode as a scalar instead.
#  - The filled column is assigned back rather than using inplace=True on a
#    column selection, which may act on a temporary object and leave df as is.
#  - "?" is the dataset's missing-value marker; it is normalised to NaN here
#    so the imputation also works on columns that still carry the raw marker.
for column in ("native-country", "Job", "Jobclass"):
    values = df[column].replace("?", np.nan)
    most_common = values.mode()
    # mode() is empty when the column has no non-missing values at all;
    # there is nothing sensible to fill with in that case.
    if not most_common.empty:
        values = values.fillna(most_common[0])
    df[column] = values


# Remaining missing values per column (all zeros once imputation succeeded).
df.isna().sum()

Jobclass           0
Job                0
age                0
Education-level    0
marital-status     0
race               0
gender             0
hours-per-week     0
native-country     0
income             0
dtype: int64


# Encode the income labels as integers: "<=50K" becomes 0 and ">50K" becomes 1.
income_codes = {"<=50K": 0, ">50K": 1}
df["income"] = df["income"].replace(income_codes)


df.head()


# Data type of every column.
df.dtypes

Jobclass           object
Job                object
age                 int64
Education-level    object
marital-status     object
race               object
gender             object
hours-per-week      int64
native-country     object
income              int32
dtype: object


# Convert income to int32. Series.astype returns a converted copy, so the
# result has to be stored back in the frame; otherwise the column keeps
# whatever dtype it already had.
df["income"] = df["income"].astype("int32")
df.dtypes

Jobclass           object
Job                object
age                 int64
Education-level    object
marital-status     object
race               object
gender             object
hours-per-week      int64
native-country     object
income              int32
dtype: object


# Correlation heat map over the numeric columns.
# The figure size is configured first so it applies to this plot rather than
# only to figures created afterwards.
sns.set(rc={"figure.figsize": (10, 7)})
# Restrict to numeric columns explicitly: correlation is undefined for the
# object (string) columns. The matrix is heatmap's first argument, named
# `data`; also passing data=df would supply that argument twice and raise
# TypeError.
numeric_corr = df.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True)
plt.show()


# Box plot of age for each income value, with boxes also coloured by income.
sns.boxplot(data=df, x="income", y="age", hue="income")
plt.show()


# Horizontal count plot: number of records per Education-level, split by income.
sns.countplot(data=df, y="Education-level", hue="income")
# Show the figure here, as the other plot cells do. Without this call a
# notebook echoes the Axes repr, and when run as a script the next seaborn
# call would draw onto the same current axes.
plt.show()

<AxesSubplot:xlabel='count', ylabel='Education-level'>


# Horizontal count plot: number of records per Job, split by income.
sns.countplot(data=df, y="Job", hue="income")
plt.show()

	Jobclass	Job	age	Education-level	marital-status	race	gender	hours-per-week	native-country	income
0	Private	Machine-op-inspct	25	11th	Never-married	Black	Male	40	United-States	0
1	Private	Farming-fishing	38	HS-grad	Married-A-spouse	White	Male	50	United-States	0
2	Local-gov	Protective-serv	28	Assoc-acdm	Married-A-spouse	White	Male	40	United-States	1
3	Private	Machine-op-inspct	44	Some-college	Married-A-spouse	Black	Male	40	United-States	1
4	?	?	18	Some-college	Never-married	White	Female	30	United-States	0

Basic Data Analysis Project

Tayyab Sarfraz

import the required libraries¶

Import required dataset¶

How many records are there in data.csv?¶

In data.csv, there are some missing values, which are denoted by ?¶

Interestingly, the columns Job and Jobclass contain many missing values. How many records have both of these columns missing?¶

Replace all ? occurrences with NaN¶

List unique entries in columns Job, Jobclass and native-country¶

Replace NaN in columns Job, Jobclass and native-country with the most frequently occurring value.¶

income column contains strings <=50K and >50K. Replace with 0 and 1 respectively.¶

List down the data types of every column¶

Convert column income data type to int32¶

Plot the heat map of the correlation between all the numeric columns in the dataframe as shown below¶

Plot boxplot between the age and income¶

plot for Job and income¶

Basic Data Analysis Project

Tayyab Sarfraz

import the required libraries¶

Import required dataset¶

How many records are there in data.csv?¶

In data.csv, there are some missing values, which are denoted by ?¶

Interestingly, the columns Job and Jobclass contain many missing values. How many records have both of these columns missing?¶

Replace all ? occurrences with NaN¶

List unique entries in columns Job, Jobclass and native-country¶

Replace NaN in columns Job, Jobclass and native-country with the most frequently occurring value.¶

income column contains strings <=50K and >50K. Replace with 0 and 1 respectively.¶

List down the data types of every column¶

Convert column income data type to int32¶

Plot the heat map of the correlation between all the numeric columns in the dataframe as shown below¶

Plot boxplot between the age and income¶

Education-level and income are usually related. Plot a bar chart that shows the number of people at each education level, split by income¶

plot for Job and income¶