import statistics
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
pd.set_option("display.max_rows", None, "display.max_columns", None)
df = pd.read_csv("dataset-48240.csv")
df.head()
## age black case clerical construc educ earns74 gdhlth inlf leis1 \
## 0 32 0 1 0.0 0.0 12 0 0 1 3529
## 1 31 0 2 0.0 0.0 14 9500 1 1 2140
## 2 44 0 3 0.0 0.0 17 42500 1 1 4595
## 3 30 0 4 0.0 0.0 12 42500 1 1 3211
## 4 64 0 5 0.0 0.0 14 2500 1 1 4052
##
## leis2 leis3 smsa lhrwage lothinc male marr prot rlxall selfe \
## 0 3479 3479 0 1.955861 10.075380 1 1 1 3163 0
## 1 2140 2140 0 0.357674 0.000000 1 0 1 2920 1
## 2 4505 4227 1 3.021887 0.000000 1 1 0 3038 1
## 3 3211 3211 0 2.263844 0.000000 0 1 1 3083 1
## 4 4007 4007 0 1.011601 9.328213 1 1 1 3493 0
##
## sleep slpnaps south spsepay spwrk75 totwrk union worknrm workscnd \
## 0 3113 3163 0 0 0 3438 0 3438 0
## 1 2920 2920 1 0 0 5020 0 5020 0
## 2 2670 2760 0 20000 1 2815 0 2815 0
## 3 3083 3083 0 5000 1 3786 0 3786 0
## 4 3448 3493 0 2400 1 2580 0 2580 0
##
## exper yngkid yrsmarr hrwage agesq
## 0 14 0 13 7.070004 1024
## 1 11 0 0 1.429999 961
## 2 21 0 0 20.529997 1936
## 3 12 0 12 9.619998 900
## 4 44 0 33 2.750000 4096
We replace NA values in 'sleep' with the median.
median = df['sleep'].median()
df['sleep'] = df['sleep'].fillna(median)
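A quick sanity check (our addition, not in the original code) confirms that no missing values remain after imputation:
# No NaN values should remain in 'sleep' after the median fill
assert df['sleep'].isna().sum() == 0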
df['sleep'].describe()
## count 706.000000
## mean 3266.355524
## std 444.413448
## min 755.000000
## 25% 3015.000000
## 50% 3270.500000
## 75% 3532.250000
## max 4695.000000
## Name: sleep, dtype: float64
range_ = df['sleep'].max()-df['sleep'].min()
q1 = df['sleep'].quantile(0.25)
q3 = df['sleep'].quantile(0.75)
interquartile_range = q3-q1
print(range_)
## 3940
print(interquartile_range)
## 517.25
print(range_/df['sleep'].max())
## 0.8391906283280085
print(interquartile_range/df['sleep'].max())
## 0.11017039403620874
The range is a large fraction of the maximum value, which suggests substantial dispersion in the data. The interquartile range, by contrast, is small relative to the range, which suggests the spread is driven by a few outliers.
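To make the outlier claim concrete, a short sketch applying Tukey's 1.5×IQR rule of thumb (our own addition, reusing the quartiles computed above) counts the observations outside the fences:
# Tukey's rule: flag points beyond 1.5*IQR from the quartiles
lower_fence = q1 - 1.5 * interquartile_range
upper_fence = q3 + 1.5 * interquartile_range
print(((df['sleep'] < lower_fence) | (df['sleep'] > upper_fence)).sum())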
df['sleep'].var()
## 197503.313139654
coefficient_of_variation = df['sleep'].std()/df['sleep'].mean()
print(coefficient_of_variation)
## 0.13605789239637386
The coefficient of variation is well below 1, so the standard deviation is small relative to the mean.
fig1 = plt.figure(1)
sns.histplot(df['sleep'], kde=True, stat='density')
plt.title("Density of 'sleep' distribution")
plt.show()
fig2 = plt.figure(2)
sns.boxplot(x=df['sleep'])
plt.title("Distribution of 'sleep'")
plt.show()
The density plot is roughly bell-shaped. The boxplot shows some outliers.
sleep_cor = df.corr()['sleep']
print(sleep_cor)
## age 0.090373
## black -0.027057
## case 0.094006
## clerical 0.048081
## construc 0.041229
## educ -0.095004
## earns74 -0.076890
## gdhlth -0.102825
## inlf -0.027126
## leis1 -0.154080
## leis2 -0.155630
## leis3 -0.162828
## smsa -0.066997
## lhrwage -0.067197
## lothinc 0.036661
## male -0.035909
## marr 0.053757
## prot 0.027147
## rlxall 0.867744
## selfe 0.001782
## sleep 1.000000
## slpnaps 0.893043
## south 0.078600
## spsepay 0.007881
## spwrk75 0.007868
## totwrk -0.321384
## union 0.009965
## worknrm -0.322300
## workscnd 0.001139
## exper 0.104191
## yngkid -0.013262
## yrsmarr 0.063997
## hrwage -0.049450
## agesq 0.099722
## Name: sleep, dtype: float64
from statsmodels.stats.outliers_influence import variance_inflation_factor
X_variables = df[['sleep','rlxall','slpnaps']]
vif_data = pd.DataFrame()
vif_data["feature"] = X_variables.columns
vif_data["VIF"] = [variance_inflation_factor(X_variables.values, i) for i in range(len(X_variables.columns))]
print(vif_data)
## feature VIF
## 0 sleep 231.595589
## 1 rlxall 708.546242
## 2 slpnaps 908.855472
We exclude 'rlxall' and 'slpnaps' from further analysis: both are defined as 'sleep' plus additional time, so they are correlated with 'sleep' by construction, as the very high VIF values confirm.
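As a sanity check on this reasoning (a minimal sketch; it only assumes the two variables are defined as 'sleep' plus extra time, as stated above), neither variable should ever be smaller than 'sleep':
# If 'slpnaps' and 'rlxall' are sleep plus additional time, they can never fall below 'sleep'
print((df['slpnaps'] >= df['sleep']).all())
print((df['rlxall'] >= df['sleep']).all())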
sleep_cor.loc[['totwrk','worknrm','workscnd']]
## totwrk -0.321384
## worknrm -0.322300
## workscnd 0.001139
## Name: sleep, dtype: float64
Second-job minutes ('workscnd') show almost no correlation with 'sleep', while total work minutes ('totwrk') and primary-job minutes ('worknrm') show a moderate negative correlation.
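To check whether these correlations differ significantly from zero, a quick sketch with scipy.stats.pearsonr (our addition; it assumes these columns contain no missing values):
from scipy.stats import pearsonr
# pearsonr returns the correlation coefficient and a two-sided p-value
for col in ['totwrk', 'worknrm', 'workscnd']:
    r, p = pearsonr(df[col], df['sleep'])
    print(col, r, p)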
fig3 = plt.figure(3)
sns.scatterplot(data = df, x = 'workscnd', y = 'sleep')
plt.title("Second job minutes vs sleep minutes")
plt.show()
fig4 = plt.figure(4)
sns.scatterplot(data = df, x = 'worknrm', y = 'sleep')
plt.title("Primary job minutes vs sleep minutes")
plt.show()
fig5 = plt.figure(5)
sns.scatterplot(data = df, x = 'totwrk', y = 'sleep')
plt.title("Total work minutes vs sleep minutes")
plt.show()
import scipy.stats as stats
X = df['totwrk']
y = df['sleep']
X = sm.add_constant(X)
model = sm.OLS(y,X).fit()
print(model.summary())
## OLS Regression Results
## ==============================================================================
## Dep. Variable: sleep R-squared: 0.103
## Model: OLS Adj. R-squared: 0.102
## Method: Least Squares F-statistic: 81.09
## Date: Tue, 14 Jun 2022 Prob (F-statistic): 1.99e-18
## Time: 21:12:57 Log-Likelihood: -5267.1
## No. Observations: 706 AIC: 1.054e+04
## Df Residuals: 704 BIC: 1.055e+04
## Df Model: 1
## Covariance Type: nonrobust
## ==============================================================================
## coef std err t P>|t| [0.025 0.975]
## ------------------------------------------------------------------------------
## const 3586.3770 38.912 92.165 0.000 3509.979 3662.775
## totwrk -0.1507 0.017 -9.005 0.000 -0.184 -0.118
## ==============================================================================
## Omnibus: 68.651 Durbin-Watson: 1.955
## Prob(Omnibus): 0.000 Jarque-Bera (JB): 192.044
## Skew: -0.483 Prob(JB): 1.99e-42
## Kurtosis: 5.365 Cond. No. 5.71e+03
## ==============================================================================
##
## Notes:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
## [2] The condition number is large, 5.71e+03. This might indicate that there are
## strong multicollinearity or other numerical problems.
An R-squared of 0.103 tells us that only 10.3% of the variance in 'sleep' is explained by 'totwrk'. The p-value of the coefficient is close to 0, so the relationship is statistically significant.
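As a cross-check on that reading, R-squared can be reconstructed by hand from the fitted model (a minimal sketch using the objects defined above):
# R-squared = 1 - SSR/SST; should reproduce the 0.103 from the summary
ssr = (model.resid ** 2).sum()
sst = ((y - y.mean()) ** 2).sum()
print(1 - ssr / sst)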
fig6 = plt.figure(6)
sns.scatterplot(data = df,x = "totwrk",y = "sleep")
sns.lineplot(x = df['totwrk'], y = model.fittedvalues,color = "red")
plt.show()
The relationship appears linear for most of the data, though the fit deviates for the largest 'totwrk' values.
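One way to probe the linearity visually (an optional sketch; seaborn's lowess option relies on statsmodels, which is already imported) is to overlay a locally weighted smoother on the scatter:
plt.figure()
# A lowess curve that stays close to the OLS line supports the linearity assumption
sns.regplot(data=df, x='totwrk', y='sleep', lowess=True, scatter_kws={'alpha': 0.3}, line_kws={'color': 'red'})
plt.title("Lowess fit of totwrk vs sleep")
plt.show()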
res = model.resid
fig7 = plt.figure(7)
sm.qqplot(res, stats.norm, fit=True, line='s', ax=fig7.gca())
plt.title("Normal Q-Q plot")
plt.show()
The heavy tails of the Q-Q plot indicate that the residuals have more extreme values than a normal distribution would predict.
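The visual impression can be quantified with a normality test on the residuals (a quick sketch using scipy, consistent with the Jarque-Bera statistic already reported in the regression summary):
# Null hypothesis: the residuals are normally distributed
jb_stat, jb_pvalue = stats.jarque_bera(res)
print(jb_stat, jb_pvalue)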
influence = model.get_influence()
cooksD = influence.cooks_distance
fig8 = plt.figure(8)
sns.scatterplot(x = df.index, y = cooksD[0])
plt.title("Cook's Distance")
plt.xlabel('x')
plt.ylabel('Cooks Distance')
plt.show()
Observation 10 has a large Cook's distance, so we check its leverage.
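A common rule of thumb (our assumption, not a hard cutoff) flags observations whose Cook's distance exceeds 4/n:
# Flag observations with Cook's distance above the 4/n heuristic
n = len(df)
print(df.index[cooksD[0] > 4 / n])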
sm.graphics.influence_plot(model, alpha = 0.05, criterion="cooks")
plt.show()
Observation 10 has high leverage but a relatively small residual, so it is not an influential observation.
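The leverage values behind the influence plot can also be read off directly (a minimal sketch using the hat matrix diagonal from the same influence object):
# Leverage of observation 10 vs. the average leverage p/n
leverage = influence.hat_matrix_diag
print(leverage[10], leverage.mean())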
fig9 = plt.figure(9)
sns.scatterplot(y=res,x=model.fittedvalues)
plt.title("Residuals vs fitted values")
plt.xlabel('fitted values')
plt.ylabel('residuals')
plt.axhline(0,ls='--',color='red')
plt.show()
The residuals are spread evenly around zero with roughly constant variance across the fitted values, so the model appears homoscedastic.
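That visual judgment can be backed by a Breusch-Pagan test (a sketch using statsmodels' diagnostic module; the null hypothesis is homoscedasticity, so a large p-value would support the conclusion above):
from statsmodels.stats.diagnostic import het_breuschpagan
# Returns (LM statistic, LM p-value, F statistic, F p-value)
bp_lm, bp_lm_pvalue, bp_f, bp_f_pvalue = het_breuschpagan(model.resid, model.model.exog)
print(bp_lm_pvalue)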