Data
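The output below was presumably produced along these lines (a sketch; the data-loading step is not shown in this section, and the later chunks additionally rely on dplyr, ggplot2, and gridExtra):

# Presumed setup for the chunks that follow
library(dplyr)
library(ggplot2)

# Summary statistics and dimensions of the household data
summary(HH_data)
dim(HH_data)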

##    tot_income     tot_expenses   
##  Min.   :   24   Min.   :   120  
##  1st Qu.:  600   1st Qu.:   900  
##  Median : 1500   Median :  1500  
##  Mean   : 1995   Mean   :  2088  
##  3rd Qu.: 3000   3rd Qu.:  3000  
##  Max.   :25000   Max.   :100000
## [1] 6989    2

Data Visualization

Histogram

Using ggplot2, with the number of bins set to the square root of the number of observations (more bins than ggplot2's default of 30); the expenses panel uses a log2-transformed y-axis so the long right tail remains visible:

p1 <- HH_data %>%
  ggplot(aes(x = tot_income)) +
  geom_histogram(bins = round(sqrt(nrow(HH_data))), fill = "#FF6666") +
  theme_minimal()

p2 <- HH_data %>%
  ggplot(aes(x = tot_expenses)) +
  geom_histogram(bins = round(sqrt(nrow(HH_data))), fill = "#FF6666") +
  scale_y_continuous(trans = "log2") +
  theme_minimal()

gridExtra::grid.arrange(p1, p2, nrow = 1)

Boxplot

p1 <- ggplot(HH_data) +
  aes(x = "", y = tot_income) +
  geom_boxplot(fill = "#0c4c8a") +
  theme_minimal()


p2 <- ggplot(HH_data) +
  aes(x = "", y = tot_expenses) +
  geom_boxplot(fill = "#0c4c8a") +
  theme_minimal()

gridExtra::grid.arrange(p1, p2, nrow = 1)

Scatter Plot

p1 <- HH_data %>%
  ggplot(aes(x = tot_income, y = tot_expenses)) +
  geom_point(color = "#FF6666") +
  geom_smooth(method = "lm", se = FALSE, lty = "dashed") +
  ggtitle("Total Income and Total Expenses")
p1
## `geom_smooth()` using formula 'y ~ x'

p2 <- HH_data %>%
  ggplot(aes(x = tot_income, y = tot_expenses)) +
  geom_point(alpha = .2) +
  geom_density2d()

p3 <- HH_data %>%
  ggplot(aes(x = tot_income, y = tot_expenses)) +
  geom_hex(bins = 50, show.legend = FALSE)

gridExtra::grid.arrange(  p2, p3, nrow = 1)

Log-transformed Variables

p1 <- HH_data %>%
  ggplot(aes(x = tot_income, y = tot_expenses)) +
  geom_point(color = "#FF6666", alpha = .3) +
  geom_smooth(method = "lm", se = FALSE, color = "red", lty = "dashed") +
  ggtitle("Non-transformed variables")

p2 <- HH_data %>%
  ggplot(aes(x = tot_income, y = tot_expenses)) +
  geom_point(alpha = .3, color = "#FF6666") +
  geom_smooth(method = "lm", se = FALSE, color = "red", lty = "dashed") +
  scale_x_log10() +
  scale_y_log10() +
  ggtitle("log-transformed variables")

gridExtra::grid.arrange(p1, p2, nrow = 1)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Modeling

Model1 <- lm(tot_expenses ~ tot_income, data = HH_data)

method          Residual_standard_error  Adjusted_R_squared
Original model  1996.369                 0.3474183
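The model-comparison tables in this report can be assembled along these lines (a sketch: results_table is a hypothetical name; sigma() extracts the residual standard error of an lm fit and summary()$adj.r.squared its adjusted R²):

# Collect fit statistics for the comparison table
results_table <- data.frame(
  method = "Original model",
  Residual_standard_error = sigma(Model1),
  Adjusted_R_squared = summary(Model1)$adj.r.squared
)
results_table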

Outlier Handling

- An outlier is a value or observation that is distant from the other observations.

- In other words, it is a data point that differs significantly from the rest of the data.

Removing Outliers

Using the Boxplot (IQR) Method

boxplot() flags as outliers any points lying more than 1.5 times the interquartile range beyond the lower or upper hinge, and returns them in its $out component:

outliers <- boxplot(HH_data$tot_income, plot = FALSE)$out
dataWithoutOutlier <- HH_data[-which(HH_data$tot_income %in% outliers), ]

outliers2 <- boxplot(dataWithoutOutlier$tot_expenses, plot = FALSE)$out
dataWithoutOutlier <- dataWithoutOutlier[-which(dataWithoutOutlier$tot_expenses %in% outliers2), ]
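For reference, the rule behind boxplot()'s $out component can be reproduced by hand. A minimal sketch of the default 1.5 × IQR fences (boxplot.stats() computes hinges with fivenum(), which can differ slightly from quantile()'s quartiles):

h <- fivenum(HH_data$tot_income)   # min, lower hinge, median, upper hinge, max
iqr <- h[4] - h[2]
lower_fence <- h[2] - 1.5 * iqr
upper_fence <- h[4] + 1.5 * iqr

# Should match boxplot(...)$out up to ordering
manual_out <- HH_data$tot_income[HH_data$tot_income < lower_fence |
                                   HH_data$tot_income > upper_fence]
all(sort(manual_out) == sort(outliers))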

# Model after removing outliers with the boxplot method
dim(dataWithoutOutlier)
## [1] 6674    2
Model_box_plot <- lm(tot_expenses ~ tot_income, data = dataWithoutOutlier)

method                         Residual_standard_error  Adjusted_R_squared
Original model                 1996.3692                0.3474183
Remove Outliers Using Boxplot  694.0943                 0.6696293

Using the Z-score Method

Each value is standardized as z = (x − mean(x)) / sd(x), and a row is dropped when either variable has an absolute z-score above 2 (a common cut-off; 3 is also widely used):

df <- data.frame(HH_data$tot_income, HH_data$tot_expenses)
z_scores <- as.data.frame(sapply(df, function(x) abs(x - mean(x)) / sd(x)))

# Keep only the rows where neither variable has a z-score above 2
no_outliers2 <- df[rowSums(z_scores > 2) == 0, ]
dim(no_outliers2)
## [1] 6620    2
Model_z_score <- lm(no_outliers2$HH_data.tot_expenses ~ no_outliers2$HH_data.tot_income)
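The same z-scores can be computed more idiomatically with base R's scale(), which centers and scales each column; a quick equivalence check (a sketch):

z_scores_alt <- as.data.frame(abs(scale(df)))
all.equal(z_scores, z_scores_alt, check.attributes = FALSE)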
method                                Residual_standard_error  Adjusted_R_squared
Original model                        1996.3692                0.3474183
Remove Outliers Using Boxplot         694.0943                 0.6696293
Remove Outliers Using Z-score Method  741.2360                 0.6426958

Transform outliers

Quantile based flooring and capping

In this technique, values above the 90th percentile are capped at the 90th percentile, and values below the 10th percentile are floored at the 10th percentile.

# Compute the 10th and 90th percentiles of tot_income
tenth_percentile <- quantile(HH_data$tot_income, .1)
ninetieth_percentile <- quantile(HH_data$tot_income, .9)

# Compute the 10th and 90th percentiles of tot_expenses
tenth_percentile_tot_expenses <- quantile(HH_data$tot_expenses, .1)
ninetieth_percentile_tot_expenses <- quantile(HH_data$tot_expenses, .9)

# Floor values below the 10th percentile and cap values above the 90th
flooringCappingData <- HH_data %>%
  mutate(
    tot_income = case_when(
      tot_income < tenth_percentile ~ tenth_percentile,
      tot_income > ninetieth_percentile ~ ninetieth_percentile,
      TRUE ~ tot_income
    ),
    tot_expenses = case_when(
      tot_expenses < tenth_percentile_tot_expenses ~ tenth_percentile_tot_expenses,
      tot_expenses > ninetieth_percentile_tot_expenses ~ ninetieth_percentile_tot_expenses,
      TRUE ~ tot_expenses
    )
  )
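An equivalent and more compact base-R idiom clamps each variable with pmin()/pmax(); a sketch reusing the percentiles computed above (flooringCappingData2 is a hypothetical name):

# Clamp each variable to its [10th, 90th] percentile range
flooringCappingData2 <- HH_data %>%
  mutate(
    tot_income   = pmax(pmin(tot_income, ninetieth_percentile), tenth_percentile),
    tot_expenses = pmax(pmin(tot_expenses, ninetieth_percentile_tot_expenses),
                        tenth_percentile_tot_expenses)
  )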

Plotting the Data

p1 <- ggplot(flooringCappingData) +
  aes(x = "", y = tot_income) +
  geom_boxplot(fill = "#0c4c8a") +
  theme_minimal()


p2 <- ggplot(flooringCappingData) +
  aes(x = "", y = tot_expenses) +
  geom_boxplot(fill = "#0c4c8a") +
  theme_minimal()

gridExtra::grid.arrange(p1, p2, nrow = 1)

flooringCappingModel <- lm(tot_expenses ~ tot_income, data = flooringCappingData)

method                                Residual_standard_error  Adjusted_R_squared
Original model                        1996.3692                0.3474183
Remove Outliers Using Boxplot         694.0943                 0.6696293
Remove Outliers Using Z-score Method  741.2360                 0.6426958
Quantile based flooring and capping   617.4033                 0.7195258

Modeling with outliers

Robust Regression

Robust regression is a method we can use as an alternative to ordinary least squares regression when there are outliers or influential observations in the dataset we’re working with.

To perform robust regression in R, we can use the rlm() function from the MASS package, which uses the following syntax:

library(MASS)

# Fit a robust regression model (rlm() defaults to Huber M-estimation)
robustModel <- rlm(tot_expenses ~ tot_income, data = HH_data)

The residual standard error (RSE) is a way to measure the standard deviation of the residuals in a regression model. The lower the value for RSE, the more closely a model is able to fit the data.
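rlm() does not report an R², hence the NA in the table below; the robust scale estimate that serves as its residual standard error is stored on the fitted object:

# Robust scale estimate: the value reported as the RSE for the rlm fit
robustModel$s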

method                                Residual_standard_error  Adjusted_R_squared
Original model                        1996.3692                0.3474183
Remove Outliers Using Boxplot         694.0943                 0.6696293
Remove Outliers Using Z-score Method  741.2360                 0.6426958
Quantile based flooring and capping   617.4033                 0.7195258
Robust Regression                     489.6966                 NA

We can see that the RSE for the robust regression model is much lower than that of the ordinary least squares regression model, which tells us that the robust regression model fits the data more closely.

Weighted Least Squares Regression

One of the key assumptions of linear regression is that the residuals are distributed with equal variance at each level of the predictor variable. This assumption is known as homoscedasticity.

When this assumption is violated, we say that heteroscedasticity is present in the residuals. When this occurs, the results of the regression become unreliable.

One way to handle this issue is to instead use weighted least squares regression, which places weights on the observations such that those with small error variance are given more weight since they contain more information compared to observations with larger error variance.
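Formally, where ordinary least squares minimizes Σ (yᵢ − ŷᵢ)², weighted least squares minimizes the weighted sum Σ wᵢ (yᵢ − ŷᵢ)², so an observation with a larger weight wᵢ (smaller error variance) has more influence on the fitted coefficients.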

Test for Heteroscedasticity

We first create a residual vs. fitted values plot to visually check for heteroscedasticity:

#create residual vs. fitted plot
plot(fitted(Model1), resid(Model1), xlab='Fitted Values', ylab='Residuals')

#add a horizontal line at 0 
abline(0,0)

We can see from the plot that the residuals are not distributed with equal variance across the range of fitted values.

To formally test for heteroscedasticity, we can perform a Breusch-Pagan test:

The Breusch-Pagan test uses the following null and alternative hypotheses:

Null Hypothesis (H0): Homoscedasticity is present (the residuals are distributed with equal variance).

Alternative Hypothesis (HA): Heteroscedasticity is present (the residuals are not distributed with equal variance).

library(lmtest)

# Perform the Breusch-Pagan test
bptest(Model1)
## 
##  studentized Breusch-Pagan test
## 
## data:  Model1
## BP = 44.84, df = 1, p-value = 2.138e-11

Since the p-value from the test is less than 0.05, we reject the null hypothesis and conclude that heteroscedasticity is a problem in this model.

Perform Weighted Least Squares Regression

Since heteroscedasticity is present, we will perform weighted least squares by defining the weights in such a way that the observations with lower variance are given more weight:

# Define weights: the inverse of the squared fitted values from a regression
# of the absolute residuals of Model1 on its fitted values
wt <- 1 / lm(abs(Model1$residuals) ~ Model1$fitted.values)$fitted.values^2

# Perform weighted least squares regression
wls_model <- lm(tot_expenses ~ tot_income, data = HH_data, weights = wt)
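As a quick sanity check (a sketch, not part of the original analysis), the weighted residuals, resid(wls_model) * sqrt(wt), should look more evenly spread than the raw residuals of Model1:

# Residual vs. fitted plot on the weighted scale
plot(fitted(wls_model), resid(wls_model) * sqrt(wt),
     xlab = 'Fitted Values', ylab = 'Weighted Residuals')
abline(0, 0)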
method                                Residual_standard_error  Adjusted_R_squared
Original model                        1996.369175              0.3474183
Remove Outliers Using Boxplot         694.094265               0.6696293
Remove Outliers Using Z-score Method  741.236030               0.6426958
Quantile based flooring and capping   617.403348               0.7195258
Robust Regression                     489.696636               NA
Weighted Least Squares Regression     2.255465                 0.3601616

The weighted least squares model has a residual standard error of 2.255465, compared to 1996.369175 for the original simple linear regression model. Note, however, that the RSE of a weighted fit is computed on the weighted residual scale, so the two values are not directly comparable.

Because it accounts for the unequal error variance, the weighted least squares model nonetheless produces more reliable coefficient estimates and standard errors than the simple linear regression model.