## tot_income tot_expenses
## Min. : 24 Min. : 120
## 1st Qu.: 600 1st Qu.: 900
## Median : 1500 Median : 1500
## Mean : 1995 Mean : 2088
## 3rd Qu.: 3000 3rd Qu.: 3000
## Max. :25000 Max. :100000
## [1] 6989 2
Using R base (with the number of bins corresponding to the square root of the number of observations in order to have more bins than the default option):
p1<- HH_data %>% ggplot( aes(x = tot_income) ) +
geom_histogram(bins = sqrt(7514), fill = "#FF6666") +
theme_minimal()
p2 <- HH_data %>% ggplot( aes(x = tot_expenses) ) +
geom_histogram(bins = sqrt(7514), fill = "#FF6666") +
scale_y_continuous(trans = "log2")+
theme_minimal()
gridExtra::grid.arrange(p1, p2, nrow = 1)
p1 <- ggplot(HH_data) +
aes(x = "", y = tot_income) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal()
p2 <- ggplot(HH_data) +
aes(x = "", y = tot_expenses) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal()
gridExtra::grid.arrange(p1, p2, nrow = 1)
p1 <- HH_data %>%
ggplot( aes(x= tot_income, y = tot_expenses )) +
geom_point(color='#FF6666' ) +
geom_smooth(method = "lm", se = FALSE, lty = "dashed") +
ggtitle("Total Income and Total Expenses")
p1
## `geom_smooth()` using formula 'y ~ x'
p2 <- HH_data %>%
ggplot( aes(x= tot_income, y = tot_expenses )) +
geom_point(alpha = .2) +
geom_density2d()
p3 <- HH_data %>%
ggplot( aes(x= tot_income, y = tot_expenses )) +
geom_hex(bins = 50, show.legend = FALSE)
gridExtra::grid.arrange( p2, p3, nrow = 1)
p1 <- HH_data %>%
ggplot( aes(x= tot_income, y = tot_expenses )) +
geom_point(color='#FF6666',alpha = .3) +
geom_smooth(method = "lm", se = FALSE, color = "red", lty = "dashed") +
ggtitle("Non-transformed variables")
p2 <- HH_data %>%
ggplot( aes(x= tot_income, y = tot_expenses )) +
geom_point(alpha = .3,color='#FF6666') +
geom_smooth(method = "lm", se = FALSE, color = "red", lty = "dashed") +
scale_x_log10() +
scale_y_log10() +
ggtitle("log-transformed variables")
gridExtra::grid.arrange(p1, p2, nrow = 1)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Model1<-lm(HH_data$tot_expenses ~HH_data$tot_income)
| method | Residual_standard_erro | Adjusted_R_squared |
|---|---|---|
| Original model | 1996.369 | 0.3474183 |
. An outlier is a value or an observation that is distant from other observations
. A data point that differs significantly from other data points.
outliers <- boxplot(HH_data$tot_income, plot=FALSE)$out
dataWithoutOutlier <- HH_data[-which(HH_data$tot_income %in% outliers),]
outliers2 <- boxplot(dataWithoutOutlier$tot_expenses, plot=FALSE)$out
dataWithoutOutlier <- dataWithoutOutlier[-which(dataWithoutOutlier$tot_expenses %in% outliers2),]
#Model > After remove outliers Using boxplot
dim(dataWithoutOutlier)
## [1] 6674 2
Model_box_blot<-lm(data = dataWithoutOutlier , formula=tot_expenses ~tot_income)
| method | Residual_standard_erro | Adjusted_R_squared |
|---|---|---|
| Original model | 1996.3692 | 0.3474183 |
| Remove Outliers Using Boxplot | 694.0943 | 0.6696293 |
df <-data.frame(HH_data$tot_income, HH_data$tot_expenses)
z_scores <- as.data.frame(sapply(df, function(df) (abs(df-mean(df))/sd(df))))
no_outliers2 <- df[!rowSums(z_scores>2), ]
dim(no_outliers2)
## [1] 6620 2
Model_z_score <-lm(no_outliers2$HH_data.tot_expenses ~ no_outliers2$HH_data.tot_income)
| method | Residual_standard_erro | Adjusted_R_squared |
|---|---|---|
| Original model | 1996.3692 | 0.3474183 |
| Remove Outliers Using Boxplot | 694.0943 | 0.6696293 |
| Remove Outliers Using Z-score Method | 741.2360 | 0.6426958 |
In this technique, the outlier is capped at a certain value above the 90th percentile value or floored at a factor below the 10th percentile value.
# Computing 10th, 90th percentiles and replacing the outliers for tot_income
tenth_percentile = quantile(HH_data$tot_income , c(.1))
ninetieth_percentile = quantile(HH_data$tot_income , c(.9))
# Computing 10th, 90th percentiles and replacing the outliers for tot_income
tenth_percentile_tot_expenses = quantile(HH_data$tot_expenses , c(.1))
ninetieth_percentile_tot_expenses = quantile(HH_data$tot_expenses , c(.9))
flooringCappingData <- HH_data %>%
mutate(tot_income = case_when(
tot_income < tenth_percentile ~ tenth_percentile ,
tot_income > ninetieth_percentile ~ ninetieth_percentile ,
TRUE ~ tot_income
) ,
tot_expenses = case_when(
tot_expenses < tenth_percentile_tot_expenses ~ tenth_percentile_tot_expenses ,
tot_expenses > ninetieth_percentile_tot_expenses ~ ninetieth_percentile_tot_expenses ,
TRUE ~ tot_expenses
)
)
plot Data
p1 <- ggplot(flooringCappingData) +
aes(x = "", y = tot_income) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal()
p2 <- ggplot(flooringCappingData) +
aes(x = "", y = tot_expenses) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal()
gridExtra::grid.arrange(p1, p2, nrow = 1)
flooringCappingModel <- lm(tot_expenses ~tot_income, data = flooringCappingData )
| method | Residual_standard_erro | Adjusted_R_squared |
|---|---|---|
| Original model | 1996.3692 | 0.3474183 |
| Remove Outliers Using Boxplot | 694.0943 | 0.6696293 |
| Remove Outliers Using Z-score Method | 741.2360 | 0.6426958 |
| Quantile based flooring and capping | 617.4033 | 0.7195258 |
Robust regression is a method we can use as an alternative to ordinary least squares regression when there are outliers or influential observations in the dataset we’re working with.
To perform robust regression in R, we can use the rlm() function from the MASS package, which uses the following syntax:
#fit robust regression model
robustModel <- rlm(tot_expenses ~tot_income, data=HH_data)
The residual standard error (RSE) is a way to measure the standard deviation of the residuals in a regression model. The lower the value for RSE, the more closely a model is able to fit the data.
| method | Residual_standard_erro | Adjusted_R_squared |
|---|---|---|
| Original model | 1996.3692 | 0.3474183 |
| Remove Outliers Using Boxplot | 694.0943 | 0.6696293 |
| Remove Outliers Using Z-score Method | 741.2360 | 0.6426958 |
| Quantile based flooring and capping | 617.4033 | 0.7195258 |
| Robust Regression | 489.6966 | NA |
We can see that the RSE for the robust regression model is much lower than the ordinary least squares regression model, which tells us that the robust regression model offers a better fit to the data.
One of the key assumptions of linear regression is that the residuals are distributed with equal variance at each level of the predictor variable. This assumption is known as homoscedasticity.
When this assumption is violated, we say that heteroscedasticity is present in the residuals. When this occurs, the results of the regression become unreliable.
One way to handle this issue is to instead use weighted least squares regression, which places weights on the observations such that those with small error variance are given more weight since they contain more information compared to observations with larger error variance.
create a residual vs. fitted values plot to visually check for heteroscedasticity:
#create residual vs. fitted plot
plot(fitted(Model1), resid(Model1), xlab='Fitted Values', ylab='Residuals')
#add a horizontal line at 0
abline(0,0)
We can see from the plot that the residuals not distributed with equal variance throughout the plot.
To formally test for heteroscedasticity, we can perform a Breusch-Pagan test:
The Breusch-Pagan test uses the following null and alternative hypotheses: Null Hypothesis (H0): Homoscedasticity is present (the residuals are distributed with equal variance)
#perform Breusch-Pagan test
bptest(Model1)
##
## studentized Breusch-Pagan test
##
## data: Model1
## BP = 44.84, df = 1, p-value = 2.138e-11
Since the p-value from the test is less than 0.05 we will reject the null hypothesis and conclude that heteroscedasticity is a problem in this model.
Since heteroscedasticity is present, we will perform weighted least squares by defining the weights in such a way that the observations with lower variance are given more weight:
#define weights to use
wt <- 1 / lm(abs(Model1$residuals) ~ Model1$fitted.values)$fitted.values^2
#perform weighted least squares regression
wls_model <- lm(tot_expenses ~tot_income, data = HH_data, weights=wt)
| method | Residual_standard_erro | Adjusted_R_squared |
|---|---|---|
| Original model | 1996.369175 | 0.3474183 |
| Remove Outliers Using Boxplot | 694.094265 | 0.6696293 |
| Remove Outliers Using Z-score Method | 741.236030 | 0.6426958 |
| Quantile based flooring and capping | 617.403348 | 0.7195258 |
| Robust Regression | 489.696636 | NA |
| Weighted Least Squares Regression | 2.255465 | 0.3601616 |
The weighted least squares model has a residual standard error of 2.255465 compared to 1996.369175 in the original simple linear regression model.
This indicates that the predicted values produced by the weighted least squares model are much closer to the actual observations compared to the predicted values produced by the simple linear regression model.