import pandas as pd # for dataframes
import numpy as np # for numerical operations
import matplotlib.pyplot as plt # for plotting

# To install seaborn, in the terminal enter 'pip install seaborn'
import seaborn as sns

# To install scikit-learn, in the terminal enter 'pip install scikit-learn'
from sklearn.model_selection import train_test_split # to partition the data
from sklearn.linear_model import LogisticRegression # for logistic regression
from sklearn.metrics import accuracy_score, confusion_matrix # performance metrics: accuracy score and confusion matrix

Setting the working directory (the original setwd call is R; the Python equivalent is os.chdir)

# import os
# os.chdir(os.path.expanduser("~/IIT-M"))

Reading the file

data_income = pd.read_csv('income.csv') # importing data

Making copies

raw_data = data_income.copy() # backup copy, kept untouched
data = data_income.copy() # working copy for the analysis

Exploratory data analysis:

1. Getting to know the data

2. Data pre-processing (handling missing values)

3. Cross tables and data visualization

# ===============================================================================================
1. Getting to know the data
data.info() # checking the variables' data types
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 31978 entries, 0 to 31977
## Data columns (total 13 columns):
## age              31978 non-null int64
## JobType          31978 non-null object
## EdType           31978 non-null object
## maritalstatus    31978 non-null object
## occupation       31978 non-null object
## relationship     31978 non-null object
## race             31978 non-null object
## gender           31978 non-null object
## capitalgain      31978 non-null int64
## capitalloss      31978 non-null int64
## hoursperweek     31978 non-null int64
## nativecountry    31978 non-null object
## SalStat          31978 non-null object
## dtypes: int64(4), object(9)
## memory usage: 3.2+ MB
Getting the count of missing values per column
data.isnull().sum()
## age              0
## JobType          0
## EdType           0
## maritalstatus    0
## occupation       0
## relationship     0
## race             0
## gender           0
## capitalgain      0
## capitalloss      0
## hoursperweek     0
## nativecountry    0
## SalStat          0
## dtype: int64
Summary of the numerical variables
data.describe()
##                 age   capitalgain   capitalloss  hoursperweek
## count  31978.000000  31978.000000  31978.000000  31978.000000
## mean      38.579023   1064.360623     86.739352     40.417850
## std       13.662085   7298.596271    401.594301     12.345285
## min       17.000000      0.000000      0.000000      1.000000
## 25%       28.000000      0.000000      0.000000     40.000000
## 50%       37.000000      0.000000      0.000000     40.000000
## 75%       48.000000      0.000000      0.000000     45.000000
## max       90.000000  99999.000000   4356.000000     99.000000
Summary of the categorical variables (include = "O" selects the object-dtype columns)
data.describe(include = "O")
##          JobType    EdType  ...   nativecountry                        SalStat
## count      31978     31978  ...           31978                          31978
## unique         9        16  ...              41                              2
## top      Private   HS-grad  ...   United-States   less than or equal to 50,000
## freq       22286     10368  ...           29170                          24283
## 
## [4 rows x 9 columns]
Frequency of ‘age’
data['age'].value_counts()
## 36    875
## 23    871
## 31    870
## 34    862
## 33    862
##      ... 
## 83      6
## 85      3
## 88      3
## 87      1
## 86      1
## Name: age, Length: 73, dtype: int64
Frequency of ‘JobType’
jobtype_count = data['JobType'].value_counts()
print(jobtype_count)
##  Private             22286
##  Self-emp-not-inc     2499
##  Local-gov            2067
##  ?                    1809
##  State-gov            1279
##  Self-emp-inc         1074
##  Federal-gov           943
##  Without-pay            14
##  Never-worked            7
## Name: JobType, dtype: int64
Frequency of ‘EdType’
data['EdType'].value_counts()
##  HS-grad         10368
##  Some-college     7187
##  Bachelors        5210
##  Masters          1674
##  Assoc-voc        1366
##  11th             1167
##  Assoc-acdm       1055
##  10th              921
##  7th-8th           627
##  Prof-school       559
##  9th               506
##  12th              417
##  Doctorate         390
##  5th-6th           318
##  1st-4th           163
##  Preschool          50
## Name: EdType, dtype: int64
Frequency of ‘maritalstatus’
data['maritalstatus'].value_counts()
##  Married-civ-spouse       14692
##  Never-married            10488
##  Divorced                  4394
##  Separated                 1005
##  Widowed                    979
##  Married-spouse-absent      397
##  Married-AF-spouse           23
## Name: maritalstatus, dtype: int64
Frequency of ‘occupation’
occ_count = data['occupation'].value_counts()
print(occ_count)
##  Prof-specialty       4038
##  Craft-repair         4030
##  Exec-managerial      3992
##  Adm-clerical         3721
##  Sales                3584
##  Other-service        3212
##  Machine-op-inspct    1966
##  ?                    1816
##  Transport-moving     1572
##  Handlers-cleaners    1350
##  Farming-fishing       989
##  Tech-support          912
##  Protective-serv       644
##  Priv-house-serv       143
##  Armed-Forces            9
## Name: occupation, dtype: int64
Frequency of ‘relationship’
data['relationship'].value_counts()
##  Husband           12947
##  Not-in-family      8156
##  Own-child          5005
##  Unmarried          3384
##  Wife               1534
##  Other-relative      952
## Name: relationship, dtype: int64
Frequency of ‘race’
data['race'].value_counts()
##  White                 27430
##  Black                  3028
##  Asian-Pac-Islander      956
##  Amer-Indian-Eskimo      311
##  Other                   253
## Name: race, dtype: int64
Frequency of ‘gender’
data['gender'].value_counts()
##  Male      21370
##  Female    10608
## Name: gender, dtype: int64
Frequency of ‘capitalgain’
data['capitalgain'].value_counts()
## 0        29320
## 15024      341
## 7688       275
## 7298       242
## 99999      152
##          ...  
## 5060         1
## 1639         1
## 2538         1
## 7978         1
## 1111         1
## Name: capitalgain, Length: 119, dtype: int64
Frequency of ‘capitalloss’
data['capitalloss'].value_counts()
## 0       30495
## 1902      197
## 1977      163
## 1887      157
## 1848       51
##         ...  
## 3004        1
## 1844        1
## 2489        1
## 2201        1
## 2472        1
## Name: capitalloss, Length: 92, dtype: int64
Frequency of ‘hoursperweek’
data['hoursperweek'].value_counts()
## 40    14932
## 50     2772
## 45     1784
## 60     1439
## 35     1277
##       ...  
## 92        1
## 94        1
## 87        1
## 74        1
## 82        1
## Name: hoursperweek, Length: 94, dtype: int64
Frequency of ‘nativecountry’
data['nativecountry'].value_counts()
##  United-States                 29170
##  Mexico                          643
##  Philippines                     198
##  Germany                         137
##  Canada                          121
##  Puerto-Rico                     114
##  El-Salvador                     106
##  India                           100
##  Cuba                             95
##  England                          90
##  Jamaica                          81
##  South                            80
##  China                            75
##  Italy                            73
##  Dominican-Republic               70
##  Vietnam                          67
##  Guatemala                        64
##  Japan                            62
##  Poland                           60
##  Columbia                         59
##  Taiwan                           51
##  Haiti                            44
##  Iran                             43
##  Portugal                         37
##  Nicaragua                        34
##  Peru                             31
##  France                           29
##  Greece                           29
##  Ecuador                          28
##  Ireland                          24
##  Hong                             20
##  Trinadad&Tobago                  19
##  Cambodia                         19
##  Thailand                         18
##  Laos                             18
##  Yugoslavia                       16
##  Outlying-US(Guam-USVI-etc)       14
##  Honduras                         13
##  Hungary                          13
##  Scotland                         12
##  Holand-Netherlands                1
## Name: nativecountry, dtype: int64
Frequency of ‘SalStat’
data['SalStat'].value_counts()
##  less than or equal to 50,000    24283
##  greater than 50,000              7695
## Name: SalStat, dtype: int64
Finding unique ‘age’
unique_age = np.unique(data['age'])
print(unique_age)
## [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
##  65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
##  90]
Finding unique ‘JobType’
np.unique(data['JobType'])
## array([' ?', ' Federal-gov', ' Local-gov', ' Never-worked', ' Private',
##        ' Self-emp-inc', ' Self-emp-not-inc', ' State-gov', ' Without-pay'],
##       dtype=object)
Finding unique ‘EdType’
np.unique(data['EdType'])
## array([' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th',
##        ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate',
##        ' HS-grad', ' Masters', ' Preschool', ' Prof-school',
##        ' Some-college'], dtype=object)
Finding unique ‘maritalstatus’
np.unique(data['maritalstatus'])
## array([' Divorced', ' Married-AF-spouse', ' Married-civ-spouse',
##        ' Married-spouse-absent', ' Never-married', ' Separated',
##        ' Widowed'], dtype=object)
Finding unique ‘occupation’
np.unique(data['occupation'])
## array([' ?', ' Adm-clerical', ' Armed-Forces', ' Craft-repair',
##        ' Exec-managerial', ' Farming-fishing', ' Handlers-cleaners',
##        ' Machine-op-inspct', ' Other-service', ' Priv-house-serv',
##        ' Prof-specialty', ' Protective-serv', ' Sales', ' Tech-support',
##        ' Transport-moving'], dtype=object)
Finding unique ‘relationship’
np.unique(data['relationship'])
## array([' Husband', ' Not-in-family', ' Other-relative', ' Own-child',
##        ' Unmarried', ' Wife'], dtype=object)
Finding unique ‘race’
np.unique(data['race'])
## array([' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other',
##        ' White'], dtype=object)
Finding unique ‘gender’
np.unique(data['gender'])
## array([' Female', ' Male'], dtype=object)
Finding unique ‘capitalgain’
np.unique(data['capitalgain'])
## array([    0,   114,   401,   594,   914,   991,  1055,  1086,  1111,
##         1151,  1173,  1409,  1424,  1455,  1471,  1506,  1639,  1797,
##         1831,  1848,  2009,  2036,  2050,  2062,  2105,  2174,  2176,
##         2202,  2228,  2290,  2329,  2346,  2354,  2387,  2407,  2414,
##         2463,  2538,  2580,  2597,  2635,  2653,  2829,  2885,  2907,
##         2936,  2961,  2964,  2977,  2993,  3103,  3137,  3273,  3325,
##         3411,  3418,  3432,  3456,  3464,  3471,  3674,  3781,  3818,
##         3887,  3908,  3942,  4064,  4101,  4386,  4416,  4508,  4650,
##         4687,  4787,  4865,  4931,  4934,  5013,  5060,  5178,  5455,
##         5556,  5721,  6097,  6360,  6418,  6497,  6514,  6723,  6767,
##         6849,  7298,  7430,  7443,  7688,  7896,  7978,  8614,  9386,
##         9562, 10520, 10566, 10605, 11678, 13550, 14084, 14344, 15020,
##        15024, 15831, 18481, 20051, 22040, 25124, 25236, 27828, 34095,
##        41310, 99999], dtype=int64)
Finding unique ‘capitalloss’
np.unique(data['capitalloss'])
## array([   0,  155,  213,  323,  419,  625,  653,  810,  880,  974, 1092,
##        1138, 1258, 1340, 1380, 1408, 1411, 1485, 1504, 1539, 1564, 1573,
##        1579, 1590, 1594, 1602, 1617, 1628, 1648, 1651, 1668, 1669, 1672,
##        1719, 1721, 1726, 1735, 1740, 1741, 1755, 1762, 1816, 1825, 1844,
##        1848, 1876, 1887, 1902, 1944, 1974, 1977, 1980, 2001, 2002, 2042,
##        2051, 2057, 2080, 2129, 2149, 2163, 2174, 2179, 2201, 2205, 2206,
##        2231, 2238, 2246, 2258, 2267, 2282, 2339, 2352, 2377, 2392, 2415,
##        2444, 2457, 2467, 2472, 2489, 2547, 2559, 2603, 2754, 2824, 3004,
##        3683, 3770, 3900, 4356], dtype=int64)
Finding unique ‘hoursperweek’
np.unique(data['hoursperweek'])
## array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
##        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
##        35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
##        52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
##        70, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 84, 85, 86, 87, 88, 89,
##        90, 91, 92, 94, 95, 96, 97, 98, 99], dtype=int64)
Finding unique ‘nativecountry’
np.unique(data['nativecountry'])
## array([' Cambodia', ' Canada', ' China', ' Columbia', ' Cuba',
##        ' Dominican-Republic', ' Ecuador', ' El-Salvador', ' England',
##        ' France', ' Germany', ' Greece', ' Guatemala', ' Haiti',
##        ' Holand-Netherlands', ' Honduras', ' Hong', ' Hungary', ' India',
##        ' Iran', ' Ireland', ' Italy', ' Jamaica', ' Japan', ' Laos',
##        ' Mexico', ' Nicaragua', ' Outlying-US(Guam-USVI-etc)', ' Peru',
##        ' Philippines', ' Poland', ' Portugal', ' Puerto-Rico',
##        ' Scotland', ' South', ' Taiwan', ' Thailand', ' Trinadad&Tobago',
##        ' United-States', ' Vietnam', ' Yugoslavia'], dtype=object)
Finding unique ‘SalStat’
np.unique(data['SalStat'])
## array([' greater than 50,000', ' less than or equal to 50,000'],
##       dtype=object)
Re-reading the data, treating ’ ?’ as NaN
data = pd.read_csv('income.csv', na_values=[" ?"])
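Note that the categorical values in this file carry a leading space (e.g. ’ Private’, ’ ?’ in the unique-value listings above), which is why the na_values entry is ’ ?’ rather than ’?’. A minimal alternative sketch, assuming the same income.csv (data_clean is just an illustrative name), strips that whitespace at read time so the categories and the missing-value marker come in clean:

# hedged alternative: skipinitialspace=True drops the leading blank from every field
data_clean = pd.read_csv('income.csv',
                         skipinitialspace=True,
                         na_values=['?'])   # '?' now matches because the space is gone
data_clean.isnull().sum()                   # should report the same missing counts as below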
Getting the count of missing values per column after the re-read
data.isnull().sum()
## age                 0
## JobType          1809
## EdType              0
## maritalstatus       0
## occupation       1816
## relationship        0
## race                0
## gender              0
## capitalgain         0
## capitalloss         0
## hoursperweek        0
## nativecountry       0
## SalStat             0
## dtype: int64
Subsetting the rows with missing values. any(axis = 1) flags a row if at least one of its column values is missing.
missing = data[data.isnull().any(axis = 1)]
print(missing)

# 1. Missing values in JobType = 1809
# 2. Missing values in occupation = 1816
# 3. There are 1809 rows where both columns, JobType and occupation, are missing
# 4. 1816 - 1809 = 7: occupation is not filled for these 7 rows because their JobType is 'Never-worked' (a quick check is sketched after the output below)
##        age JobType  ...   nativecountry                        SalStat
## 8       17     NaN  ...   United-States   less than or equal to 50,000
## 17      32     NaN  ...   United-States   less than or equal to 50,000
## 29      22     NaN  ...   United-States   less than or equal to 50,000
## 42      52     NaN  ...   United-States   less than or equal to 50,000
## 44      63     NaN  ...   United-States   less than or equal to 50,000
## ...    ...     ...  ...             ...                            ...
## 31892   59     NaN  ...   United-States            greater than 50,000
## 31934   20     NaN  ...   United-States   less than or equal to 50,000
## 31945   28     NaN  ...   United-States            greater than 50,000
## 31967   80     NaN  ...   United-States   less than or equal to 50,000
## 31968   17     NaN  ...   United-States   less than or equal to 50,000
## 
## [1816 rows x 13 columns]
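A minimal sketch to verify those counts, assuming the ’ ?’-as-NaN version of the data loaded above (the expected numbers come from the output already shown):

# rows where both JobType and occupation are missing
both_missing = data['JobType'].isnull() & data['occupation'].isnull()
print(both_missing.sum())                            # expected: 1809

# the remaining rows with occupation missing should all have JobType == ' Never-worked'
only_occ = data['occupation'].isnull() & data['JobType'].notnull()
print(data.loc[only_occ, 'JobType'].value_counts())  # expected: ' Never-worked'  7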
data2 = data.dropna(axis = 0) # dropping every row that has at least one missing value
print(data2)
##        age       JobType  ...   nativecountry                        SalStat
## 0       45       Private  ...   United-States   less than or equal to 50,000
## 1       24   Federal-gov  ...   United-States   less than or equal to 50,000
## 2       44       Private  ...   United-States            greater than 50,000
## 3       27       Private  ...          Mexico   less than or equal to 50,000
## 4       20       Private  ...   United-States   less than or equal to 50,000
## ...    ...           ...  ...             ...                            ...
## 31973   34     Local-gov  ...   United-States   less than or equal to 50,000
## 31974   34     Local-gov  ...   United-States   less than or equal to 50,000
## 31975   23       Private  ...   United-States   less than or equal to 50,000
## 31976   42     Local-gov  ...   United-States   less than or equal to 50,000
## 31977   29       Private  ...   United-States   less than or equal to 50,000
## 
## [30162 rows x 13 columns]

Relationship between independent variables

correlation = data2.corr() # correlation matrix of the numerical variables; newer pandas versions may need data2.corr(numeric_only=True)
print(correlation)
##                    age  capitalgain  capitalloss  hoursperweek
## age           1.000000     0.080154     0.060165      0.101599
## capitalgain   0.080154     1.000000    -0.032229      0.080432
## capitalloss   0.060165    -0.032229     1.000000      0.052417
## hoursperweek  0.101599     0.080432     0.052417      1.000000
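
None of the numerical variables are strongly correlated with each other (all coefficients are close to 0). A minimal sketch of visualising this matrix with the seaborn/matplotlib already imported (the styling choices are assumptions):

sns.heatmap(correlation, annot=True, cmap='coolwarm', vmin=-1, vmax=1) # annotated correlation heatmap
plt.title('Correlation between numerical variables')
plt.show()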

===========================================================================

Cross tables and data visualization

===========================================================================

Extracting the column names

data2.columns
## Index(['age', 'JobType', 'EdType', 'maritalstatus', 'occupation',
##        'relationship', 'race', 'gender', 'capitalgain', 'capitalloss',
##        'hoursperweek', 'nativecountry', 'SalStat'],
##       dtype='object')

===========================================================================

Gender proportion table:

===========================================================================

pd.crosstab(index=data2['gender'],
            columns= 'Count',
            normalize=False)
## col_0    Count
## gender        
##  Female   9782
##  Male    20380
It shows that there are only 9,782 females in the data compared to 20,380 males.
pd.crosstab(index=data2['gender'],
            columns='percentage',
            normalize=True)
## col_0    percentage
## gender             
##  Female    0.324315
##  Male      0.675685
The percentage of females is about 32%, compared to about 68% for males.
pd.crosstab(columns= data2['gender'],
            index= data2['SalStat'],
            margins = True,
            normalize = True)
## gender                           Female      Male       All
## SalStat                                                    
##  greater than 50,000           0.036868  0.212055  0.248922
##  less than or equal to 50,000  0.287448  0.463630  0.751078
## All                            0.324315  0.675685  1.000000
Combining females and males, about 25% of all people earn more than 50,000, whereas about 75% earn 50,000 or less.
pd.crosstab(columns = data2['gender'],
        index = data2['SalStat'],
        margins = True,
        normalize = 'index')
## gender                           Female      Male
## SalStat                                          
##  greater than 50,000           0.148109  0.851891
##  less than or equal to 50,000  0.382714  0.617286
## All                            0.324315  0.675685
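The row-normalised table shows that roughly 85% of those earning more than 50,000 are male. A minimal sketch of the same comparison as a plot, using the imports above (the axis labels are assumptions):

sns.countplot(x='SalStat', hue='gender', data=data2) # salary status split by gender
plt.xlabel('Salary status')
plt.ylabel('Count')
plt.show()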

Frequency distribution of ‘SalStat’

sns.countplot(x='SalStat', data=data2) # bar chart of the two salary classes
plt.show()

sns.displot(data2['age'], kde=True, bins = 10) # histogram of age with a kernel density estimate
plt.show()