Fish Market
# load all libraries to be used
library(dplyr) # data transformation
library(ggplot2)
library(plotly)
library(GGally) # EDA, check correlations
library(performance) # model comparison
library(MLmetrics) # calculate error values
library(lmtest) # check homoscedasticity assumption
library(car) # check no-multicollinearity assumption
# Visualization theme
theme_algoritma <- theme(legend.key = element_rect(fill="black"),
legend.background = element_rect(color="white", fill="#263238"),
plot.subtitle = element_text(size=6, color="white"),
panel.background = element_rect(fill="#dddddd"),
panel.border = element_rect(fill=NA),
panel.grid.minor.x = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(color="darkgrey", linetype=2),
panel.grid.minor.y = element_blank(),
plot.background = element_rect(fill="#263238"),
text = element_text(color="white"),
axis.text = element_text(color="white")
)

fish <- read.csv("data_input/Fish.csv")
head(fish)

Description of variables:

- Species: species name of the fish
- Weight: weight of the fish in grams (g)
- Length1: vertical length in centimeters (cm)
- Length2: diagonal length in centimeters (cm)
- Length3: cross length in centimeters (cm)
- Height: height in centimeters (cm)
- Width: diagonal width in centimeters (cm)

❓ Define target and predictor variables

Target: Weight
Predictors: Species, Length1, Length2, Length3, Height, Width

summary(fish)

#> Species Weight Length1 Length2
#> Length:159 Min. : 0.0 Min. : 7.50 Min. : 8.40
#> Class :character 1st Qu.: 120.0 1st Qu.:19.05 1st Qu.:21.00
#> Mode :character Median : 273.0 Median :25.20 Median :27.30
#> Mean : 398.3 Mean :26.25 Mean :28.42
#> 3rd Qu.: 650.0 3rd Qu.:32.70 3rd Qu.:35.50
#> Max. :1650.0 Max. :59.00 Max. :63.40
#> Length3 Height Width
#> Min. : 8.80 Min. : 1.728 Min. :1.048
#> 1st Qu.:23.15 1st Qu.: 5.945 1st Qu.:3.386
#> Median :29.40 Median : 7.786 Median :4.248
#> Mean :31.23 Mean : 8.971 Mean :4.417
#> 3rd Qu.:39.65 3rd Qu.:12.366 3rd Qu.:5.585
#> Max. :68.00 Max. :18.957 Max. :8.142
Insight: Weight ranges from 0 g to 1650 g; a minimum of 0 g is suspicious for a fish, and the maximum is worth inspecting. Let's check which species the heaviest fish belongs to.

fish %>%
  select(Species, Weight) %>%
  filter(Weight == 1650)

Check the data structure of fish. Species is currently stored as character because the stringsAsFactors = T parameter was not used when reading the data, so it needs to be converted to a factor.
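Alternatively, the conversion could happen at read time; a minimal sketch (equivalent to the as.factor() conversion performed below, not run here):

# alternative: read strings as factors directly
# fish <- read.csv("data_input/Fish.csv", stringsAsFactors = TRUE)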
# check structure of data-set
glimpse(fish)

#> Rows: 159
#> Columns: 7
#> $ Species <chr> "Bream", "Bream", "Bream", "Bream", "Bream", "Bream", "Bream",…
#> $ Weight <dbl> 242, 290, 340, 363, 430, 450, 500, 390, 450, 500, 475, 500, 50…
#> $ Length1 <dbl> 23.2, 24.0, 23.9, 26.3, 26.5, 26.8, 26.8, 27.6, 27.6, 28.5, 28…
#> $ Length2 <dbl> 25.4, 26.3, 26.5, 29.0, 29.0, 29.7, 29.7, 30.0, 30.0, 30.7, 31…
#> $ Length3 <dbl> 30.0, 31.2, 31.1, 33.5, 34.0, 34.7, 34.5, 35.0, 35.1, 36.2, 36…
#> $ Height <dbl> 11.5200, 12.4800, 12.3778, 12.7300, 12.4440, 13.6024, 14.1795,…
#> $ Width <dbl> 4.0200, 4.3056, 4.6961, 4.4555, 5.1340, 4.9274, 5.2785, 4.6900…
Insight: Species is stored as character; convert it to a factor so it can be used as a categorical predictor.
fish <- fish %>%
  mutate(Species = as.factor(Species))
glimpse(fish)

#> Rows: 159
#> Columns: 7
#> $ Species <fct> Bream, Bream, Bream, Bream, Bream, Bream, Bream, Bream, Bream,…
#> $ Weight <dbl> 242, 290, 340, 363, 430, 450, 500, 390, 450, 500, 475, 500, 50…
#> $ Length1 <dbl> 23.2, 24.0, 23.9, 26.3, 26.5, 26.8, 26.8, 27.6, 27.6, 28.5, 28…
#> $ Length2 <dbl> 25.4, 26.3, 26.5, 29.0, 29.0, 29.7, 29.7, 30.0, 30.0, 30.7, 31…
#> $ Length3 <dbl> 30.0, 31.2, 31.1, 33.5, 34.0, 34.7, 34.5, 35.0, 35.1, 36.2, 36…
#> $ Height <dbl> 11.5200, 12.4800, 12.3778, 12.7300, 12.4440, 13.6024, 14.1795,…
#> $ Width <dbl> 4.0200, 4.3056, 4.6961, 4.4555, 5.1340, 4.9274, 5.2785, 4.6900…
# check missing values
colSums(is.na(fish))

#> Species Weight Length1 Length2 Length3 Height Width
#> 0 0 0 0 0 0 0
Insight: There are no missing values in any column.
❓ Are there any outliers in the target variable?
# Check outliers pooled across all species
boxplot1 <- ggplot(fish, aes(y = Weight)) +
  geom_boxplot() +
  theme_algoritma
ggplotly(boxplot1)

Insight: With all species pooled, the heaviest fish (around 1650 g) lie above the upper whisker, so the target variable does contain outliers.
# Check outliers within each species
boxplot2 <- ggplot(fish, aes(x = Species, y = Weight)) +
  geom_boxplot(aes(fill = Species)) +
  theme(legend.position = "none") +
  theme_algoritma
ggplotly(boxplot2)

Insight: Weight distributions differ markedly between species, so values that look extreme in the pooled plot may simply belong to naturally heavier species.
Check the correlations using ggcorr(). How do the numeric predictors relate to the target variable?

ggcorr(fish, label = TRUE)

Insight: All numeric predictors correlate positively and strongly with Weight. Length1, Length2, and Length3 are almost perfectly correlated with one another, an early warning of multicollinearity.
Before we go to model fitting, we need to split the data into train data-set and test data-set. We are going to train our linear regression using data train, and data test will serve as a comparison to see if the model becomes overfit and is unable to predict new data that was not seen during the training phase. We will use 70% of the data for training and the remaining 30% for testing.
set.seed(123)
samplesize <- round(0.7 * nrow(fish), 0)
index <- sample(seq_len(nrow(fish)), size = samplesize)
fish_train <- fish[index, ]
fish_test <- fish[-index, ]

In this section we will use the lm() function to create three models for comparison:

- model_none: a model without predictors
- model_all: a model with all predictors
- model_step: the model produced by the stepwise regression process

Create a model without using any predictors; in other words, the only coefficient is the intercept.
# model without predictors
model_none <- lm(formula = Weight ~ 1, data = fish_train)
summary(model_none)

#>
#> Call:
#> lm(formula = Weight ~ 1, data = fish_train)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -381.1 -261.1 -121.1 206.4 1268.9
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 381.13 35.05 10.87 <0.0000000000000002 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 369.3 on 110 degrees of freedom
Insight: Without predictors, the model always predicts the mean of the target; the intercept (381.13) is exactly the mean of Weight in the training data.

mean(fish_train$Weight)

#> [1] 381.1333
Create a model using all the columns in the fish
data.
# model with all predictors
model_all <- lm(formula = Weight ~ ., data = fish_train)
summary(model_all)

#>
#> Call:
#> lm(formula = Weight ~ ., data = fish_train)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -211.886 -50.257 -7.881 41.790 262.806
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -1032.862 146.280 -7.061 0.000000000231 ***
#> SpeciesParkki 222.953 86.362 2.582 0.011299 *
#> SpeciesPerch 173.486 137.653 1.260 0.210519
#> SpeciesPike -227.112 150.867 -1.505 0.135411
#> SpeciesRoach 160.493 104.245 1.540 0.126855
#> SpeciesSmelt 513.429 136.864 3.751 0.000296 ***
#> SpeciesWhitefish 130.712 119.850 1.091 0.278080
#> Length1 -109.779 40.780 -2.692 0.008340 **
#> Length2 108.833 51.035 2.133 0.035437 *
#> Length3 37.588 31.906 1.178 0.241591
#> Height 8.592 14.250 0.603 0.547946
#> Width -36.541 28.773 -1.270 0.207064
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 86.87 on 99 degrees of freedom
#> Multiple R-squared: 0.9502, Adjusted R-squared: 0.9447
#> F-statistic: 171.7 on 11 and 99 DF, p-value: < 0.00000000000000022
Stepwise regression: build a model for each direction (forward, backward, both) and compare them by AIC.

model_stepF <- step(object = model_none, direction = "forward",
                    scope = list(upper = model_all), trace = FALSE)
model_stepB <- step(object = model_all, direction = "backward", trace = FALSE)
model_stepBO <- step(object = model_none, direction = "both",
                     scope = list(upper = model_all), trace = FALSE)

comparison <- compare_performance(model_none, model_all, model_stepF, model_stepB, model_stepBO)
data.frame(comparison)

Insight: The best of the five models is the backward-stepwise model (model_stepB), because it has the lowest AIC. AIC estimates the amount of information a model loses, so lower is better.
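As a quick cross-check, base R's AIC() reads the same ranking off directly (a minimal sketch, not part of the original workflow):

AIC(model_none, model_all, model_stepB)
# lower AIC = less information lost = better candidate model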
summary(model_stepB)

#>
#> Call:
#> lm(formula = Weight ~ Species + Length1 + Length2, data = fish_train)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -219.988 -56.328 -4.169 43.784 278.505
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -854.412 60.083 -14.221 < 0.0000000000000002 ***
#> SpeciesParkki 124.383 43.196 2.879 0.004854 **
#> SpeciesPerch -5.606 28.784 -0.195 0.845958
#> SpeciesPike -337.398 40.827 -8.264 0.000000000000545 ***
#> SpeciesRoach 41.676 40.332 1.033 0.303892
#> SpeciesSmelt 364.916 56.027 6.513 0.000000002821559 ***
#> SpeciesWhitefish -4.873 64.656 -0.075 0.940064
#> Length1 -109.804 40.498 -2.711 0.007865 **
#> Length2 144.968 38.084 3.807 0.000241 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 86.76 on 102 degrees of freedom
#> Multiple R-squared: 0.9488, Adjusted R-squared: 0.9448
#> F-statistic: 236.3 on 8 and 102 DF, p-value: < 0.00000000000000022
Interpretation: Bream is the reference species, so each Species coefficient shifts the predicted Weight relative to a Bream with the same lengths (a Pike, for example, is predicted about 337 g lighter). Holding Species and Length1 fixed, each additional cm of Length2 adds about 145 g to the predicted Weight. The negative Length1 coefficient is counterintuitive and already hints at multicollinearity between the length measurements.
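To make the coefficients concrete, here is a minimal sketch assembling one prediction by hand from the summary above (the length values are hypothetical):

# predicted Weight (g) for a hypothetical Bream (the reference level, so no Species term)
-854.412 + (-109.804 * 23.2) + (144.968 * 25.4) # ~280 g
# equivalently:
# predict(model_stepB, newdata = data.frame(Species = "Bream", Length1 = 23.2, Length2 = 25.4))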
Linearity means that the target variable and its predictors have a linear (straight-line) relationship.
To check the linearity assumption of a multiple linear regression model, plot residuals vs fitted values: a scatter plot with the fitted values (the model's predictions for the target) on the x-axis and the residuals/errors produced by the model on the y-axis.
lin <- data.frame(residual = model_stepB$residuals, fitted = model_stepB$fitted.values)
lin <- lin %>%
  ggplot(aes(fitted, residual)) +
  geom_point() +
  geom_smooth() +
  geom_hline(aes(yintercept = 0)) +
  theme(panel.grid = element_blank(), panel.background = element_blank()) +
  theme_algoritma
ggplotly(lin)

Insight: The residuals should scatter randomly around the zero line; a clear curve in the smoother would indicate a non-linear pattern the model fails to capture.
The standard errors that are calculated for the estimated regression coefficients or the fitted values are based on the assumption that the error terms are not correlated (i.e., no autocorrelation). However, if there is actually correlation among the error terms, then the estimated standard errors will tend to underestimate the true standard errors. Consequently, confidence and prediction intervals will be narrower than they should be. For instance, a 95% confidence interval may have a much lower probability than 0.95 of containing the actual parameter value. Furthermore, p-values related to the model may be lower than they should be, which could lead us to mistakenly conclude that a parameter is statistically significant. In summary, if the error terms are correlated, we may have an unjustified sense of confidence in our model.
durbinWatsonTest(model_stepB)

#> lag Autocorrelation D-W Statistic p-value
#> 1 0.2163663 1.475004 0.006
#> Alternative hypothesis: rho != 0
Insight: The Durbin-Watson p-value (0.006) is below 0.05, so we reject the null hypothesis of no autocorrelation: the residuals of model_stepB are autocorrelated and the assumption is not met.
The Normality of Residuals is a crucial assumption in linear regression analysis, as it indicates that the distribution of errors is approximately Normal with a mean of zero and a constant variance. If the residuals are not Normally distributed, this could imply that the model is misspecified, and the regression estimates may be biased or inefficient. Therefore, it is essential to assess the Normality of Residuals by examining a Normal Probability Plot, a histogram, or conducting a formal statistical test such as the Shapiro-Wilk test. If the Normality assumption is violated, transformations of variables or the use of robust regression techniques may be necessary.
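The paragraph above mentions a normal probability plot; here is a minimal base-R sketch (an addition, not part of the original code):

# Q-Q plot: points should hug the line if the residuals are normal
qqnorm(model_stepB$residuals)
qqline(model_stepB$residuals, col = "red")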
hist <- data.frame(residu = model_stepB$residuals)
hist <- hist %>%
  ggplot(mapping = aes(x = residu)) +
  geom_histogram(fill = "navy", color = "black") +
  labs(title = "Normality of Residuals", x = "Residual", y = "Frequency", subtitle = "Histogram",
       caption = "The residuals do not look normally distributed; confirm with shapiro.test()") +
  theme_algoritma
ggplotly(hist)

shapiro.test(model_stepB$residuals)

#>
#> Shapiro-Wilk normality test
#>
#> data: model_stepB$residuals
#> W = 0.97294, p-value = 0.02339
Insight: The Shapiro-Wilk p-value (0.023) is below 0.05, so we reject normality: the residuals of model_stepB are not normally distributed and the assumption is not met.
The errors produced by the model are expected to spread randomly with constant variance; when visualized, the errors should show no pattern. This condition is known as homoscedasticity.

Plot fitted.values vs residuals:

- fitted.values: the model's predictions on the training data
- residuals: the error values

scp1 <- data.frame(fitted.values = model_stepB$fitted.values, residuals = model_stepB$residuals)
scp1 <- scp1 %>%
  ggplot() +
  geom_jitter(mapping = aes(x = fitted.values, y = residuals)) +
  geom_hline(yintercept = 0, color = "red") +
  labs(title = "Homoscedasticity of Residuals", y = "Residual", x = "Fitted Values",
       caption = "For a formal check, obtain the p-value with bptest()") +
  theme_algoritma
ggplotly(scp1)

bptest(model_stepB)

#>
#> studentized Breusch-Pagan test
#>
#> data: model_stepB
#> BP = 8.587, df = 8, p-value = 0.3783
Insight: The Breusch-Pagan p-value (0.378) is above 0.05, so we fail to reject homoscedasticity: the residual variance is constant and the assumption is met.
Multicollinearity occurs when two or more independent variables in a regression model are highly correlated with each other. This can produce unstable and unreliable coefficient estimates, because the model cannot cleanly separate the individual effect of each variable on the dependent variable. The Variance Inflation Factor (VIF) measures the severity of multicollinearity among the independent variables; a VIF above 5 or 10 is generally considered a warning sign of severe multicollinearity that may require remedial action.
vif(model_stepB)

#> GVIF Df GVIF^(1/(2*Df))
#> Species 7.747044 6 1.186027
#> Length1 2545.128466 1 50.449266
#> Length2 2601.408189 1 51.004002
Insight: The VIF values for Length1 and Length2 are far above 10, so multicollinearity occurs in the model and the assumption is not met. Length1 and Length2 will be dealt with during model tuning.
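A minimal sketch (an addition, not part of the original code) confirming the near-perfect correlation behind these VIF values:

# Length1 and Length2 measure almost the same quantity
cor(fish_train$Length1, fish_train$Length2)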
Use the best model to predict new data. Here, fish_test serves as data that has never been “seen” by the model; it is also known as testing data.
fish_test$pred_weight <- predict(
object = model_stepB,
newdata = fish_test)
# view the first 6 rows
head(fish_test)

# RMSE on training data
RMSE(y_pred = model_stepB$fitted.values, # predictions of the best model
     y_true = fish_train$Weight)

#> [1] 83.17172
# RMSE on test data
RMSE(y_pred = fish_test$pred_weight,
     y_true = fish_test$Weight)

#> [1] 112.2435
# spread of the actual data (for reference only)
range(fish_test$Weight)

#> [1] 5.9 1100.0

# standard deviation
sd(fish_test$Weight)

#> [1] 330.6775
Insight: The test RMSE (112.2) is noticeably higher than the training RMSE (83.2), so the model generalizes somewhat worse to unseen data; both errors are still well below the standard deviation of Weight (330.7). We will try to improve this with feature engineering.
# create the square root of the target variable
# create a new predictor: the average of Length1 and Length2
fish_train_tune <- fish_train %>%
  mutate(sqrt_weight = sqrt(Weight)) %>%
  mutate(average_L1_L2 = rowMeans(select(fish_train, Length1, Length2)))
# Evidence that sqrt changes the value distribution
boxplot1 <- ggplot(fish, aes(y = Weight)) +
  geom_boxplot() +
  labs(title = "Weight (before transformation)") +
  theme_algoritma
ggplotly(boxplot1)

boxplot2 <- ggplot(fish_train_tune, aes(y = sqrt_weight)) +
  geom_boxplot() +
  labs(title = "Weight (after transformation)") +
  theme_algoritma
ggplotly(boxplot2)

Insight: After the square-root transformation the distribution is much less skewed and the heaviest fish no longer appear as distant outliers.
# build a model using sqrt_weight, with average_L1_L2 replacing Length1 and Length2 to address the multicollinearity
model_fish_tune <- lm(formula = sqrt_weight ~ Species + average_L1_L2, data = fish_train_tune)

# check model summary
summary(model_fish_tune)

#>
#> Call:
#> lm(formula = sqrt_weight ~ Species + average_L1_L2, data = fish_train_tune)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -9.7484 -0.5965 0.0680 0.7544 3.6137
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -6.06123 0.86836 -6.980 0.000000000295 ***
#> SpeciesParkki -0.78258 0.69045 -1.133 0.2597
#> SpeciesPerch -2.68504 0.43784 -6.132 0.000000016182 ***
#> SpeciesPike -10.90889 0.61701 -17.680 < 0.0000000000000002 ***
#> SpeciesRoach -3.27944 0.59155 -5.544 0.000000229426 ***
#> SpeciesSmelt -1.96145 0.75263 -2.606 0.0105 *
#> SpeciesWhitefish -0.94029 1.14299 -0.823 0.4126
#> average_L1_L2 0.96653 0.02492 38.786 < 0.0000000000000002 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 1.552 on 103 degrees of freedom
#> Multiple R-squared: 0.9744, Adjusted R-squared: 0.9727
#> F-statistic: 560 on 7 and 103 DF, p-value: < 0.00000000000000022
IMPORTANT: be careful when interpreting, because each coefficient increases/decreases the sqrt_weight value, not the Weight value directly.
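A minimal sketch of what this means in practice (the baseline value is hypothetical): because predictions live on the sqrt scale, squaring makes a +1 cm change in average_L1_L2 worth more grams for larger fish.

b <- coef(model_fish_tune)["average_L1_L2"] # ~0.967 on the sqrt scale
sqrt_w <- 20 # hypothetical predicted sqrt(Weight), i.e. a 400 g fish
(sqrt_w + b)^2 - sqrt_w^2 # grams gained from +1 cm at this baseline (~39.6 g)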
lin1 <- data.frame(fitted.values = model_fish_tune$fitted.values, residuals = model_fish_tune$residuals)
lin1 <- lin1 %>%
  ggplot(mapping = aes(fitted.values, residuals)) +
  geom_jitter() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_algoritma
ggplotly(lin1)

Insight: If the residuals scatter evenly around the zero line without a clear curve, the linearity assumption holds for the tuned model.
durbinWatsonTest(model_fish_tune)

#> lag Autocorrelation D-W Statistic p-value
#> 1 -0.00005553873 1.962148 0.796
#> Alternative hypothesis: rho != 0
Insight: For model_stepB the Durbin-Watson p-value was below 0.05; after the model improvement (averaging Length1 and Length2 into average_L1_L2), the p-value is above 0.05 (0.796), which means autocorrelation is no longer present.

hist1 <- data.frame(residu = model_fish_tune$residuals)
hist1 <- hist1 %>%
  ggplot(mapping = aes(x = residu)) +
  geom_histogram(fill = "navy", color = "black") +
  labs(title = "Normality of Residuals", x = "Residual", y = "Frequency", subtitle = "Histogram",
       caption = "The residuals do not look normally distributed; confirm with check_normality()") +
  theme_algoritma
ggplotly(hist1)

check_normality(model_fish_tune)

#> Warning: Non-normality of residuals detected (p < .001).

Insight: Even after tuning, the residuals are not normally distributed (p < .001); the normality assumption remains unmet.
As before, the errors should spread randomly with constant variance (homoscedasticity), showing no pattern when visualized. Plot fitted.values vs residuals:

scp <- data.frame(fitted.values = model_fish_tune$fitted.values, residuals = model_fish_tune$residuals)
scp <- scp %>%
  ggplot() +
  geom_jitter(mapping = aes(x = fitted.values, y = residuals)) +
  geom_hline(yintercept = 0, color = "red") +
  labs(title = "Homoscedasticity of Residuals", y = "Residual", x = "Fitted Values",
       caption = "For a formal check, obtain the p-value with bptest()") +
  theme_algoritma
ggplotly(scp)

bptest(model_fish_tune)

#>
#> studentized Breusch-Pagan test
#>
#> data: model_fish_tune
#> BP = 6.5889, df = 7, p-value = 0.4729
Insight: The Breusch-Pagan p-value (0.473) is above 0.05, so the residual variance is constant and the homoscedasticity assumption is met.

check_collinearity(model_fish_tune)

Insight: In model_stepB the VIF values were above 10; after the model improvement (averaging Length1 and Length2 into average_L1_L2), the VIF is below 10, which means multicollinearity is no longer present.

When predicting with model_fish_tune, remember that the prediction is sqrt_weight; we return to the real scale with the inverse function, which is squaring.
# predictions on the training data (on the sqrt scale)
pred_sqrt_weight <- predict(
  object = model_fish_tune,
  newdata = fish_train_tune, interval = "confidence", level = 0.95)

# build the tuned test set: sqrt of Weight and the mean of Length1 and Length2
fish_test_tune <- fish_test %>%
  mutate(sqrt_weight = sqrt(Weight)) %>%
  mutate(average_L1_L2 = rowMeans(select(fish_test, Length1, Length2)))

# predictions on the test data (on the sqrt scale)
pred_sqrt_weight_test <- predict(
  object = model_fish_tune,
  newdata = fish_test_tune, interval = "confidence", level = 0.95)
# back-transform the train predictions (the inverse of sqrt is squaring)
# note: predict() returns columns fit, lwr, upr, so index them by name
fish_train_tune <- fish_train_tune %>%
  mutate(pred_weight = pred_sqrt_weight[, "fit"] ^ 2) %>%
  mutate(lwr = pred_sqrt_weight[, "lwr"] ^ 2) %>%
  mutate(upr = pred_sqrt_weight[, "upr"] ^ 2)

# back-transform the test predictions
fish_test_tune <- fish_test_tune %>%
  mutate(pred_weight = pred_sqrt_weight_test[, "fit"] ^ 2) %>%
  mutate(lwr = pred_sqrt_weight_test[, "lwr"] ^ 2) %>%
  mutate(upr = pred_sqrt_weight_test[, "upr"] ^ 2)

# Regression line + confidence intervals
p <- ggplot(fish_test_tune, aes(Weight, pred_weight)) +
geom_point() +
stat_smooth(method = lm) + theme_algoritma
ggplotly(p)

Check RMSE:
# RMSE on training data
RMSE(y_pred = fish_train_tune$pred_weight,
     y_true = fish_train_tune$Weight)

#> [1] 53.02689

# RMSE on test data
RMSE(y_pred = fish_test_tune$pred_weight,
     y_true = fish_test_tune$Weight)

#> [1] 68.67018
Range and standard deviation of the target in the tuned test set:

range(fish_test_tune$Weight)

#> [1] 5.9 1100.0

sd(fish_test_tune$Weight)

#> [1] 330.6775
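A quick sketch (a common rule of thumb, not part of the original analysis) putting the test RMSE in context of the target's spread:

rmse_test <- RMSE(y_pred = fish_test_tune$pred_weight, y_true = fish_test_tune$Weight)
rmse_test / sd(fish_test_tune$Weight) # ~0.21, i.e. the typical error is about 21% of one SD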
Formula:

model_stepB: Weight ~ Species + Length1 + Length2
model_fish_tune: sqrt_weight ~ Species + average_L1_L2

Conclusion: Averaging Length1 and Length2 into average_L1_L2 removed the multicollinearity (VIF < 10), and transforming the target with sqrt() lowered the error and improved the residual diagnostics. Instead of sqrt() we could also use log10 and delete the outliers; for more detail, please check the link below: Transform Data