In this report we will look at a cleaned nutrition dataset with approximately 9000 different food items. This data was collected from a nutrition database that originally contained several million rows. Each column contains the different nutrition components to a given food. In this report will be looking at how these different nutritional parts of the food impact the calories in a given food item. We will fit, interpret, and evaluate a regression model improving it as we uncover additional diagnostic information to produce the best possible model.
The first step I took after loading the dataset was to sample a 1000 rows for the purpose of the analysis. I then selected the columns of interest to test their impact on our dependent variable calories. All selected columns were in numeric form. I tested for the normality of the response variable and found it was right skewed. Testing the normality with a qq-plot it could be seen a large amount of the data points didn’t fall on the line. This was corrected by transforming the response variable with a square root so the data followed a normal distribution. I generated a variety of descriptive statistics to get a sense for the data and where outliers may exist. Using several plots it could be seen outliers were present in the different variables. Using the z-score method I removed present outliers and re-tested the response for normality and saw the data fit the line better than previously.
# Part 1
nutrition <- read.csv("Final_Nutrition.csv")
colnames(nutrition)
## [1] "NDB_No" "Shrt_Desc" "Long_Desc" "FdGrp_Desc"
## [5] "Water_g" "Energ_Kcal" "Protein_g" "Lipid_Tot_g"
## [9] "Carbohydrt_g" "Fiber_TD_g" "Sugar_Tot_g" "Calcium_mg"
## [13] "Iron_mg" "Magnesium_mg" "Phosphorus_mg" "Potassium_mg"
## [17] "Sodium_mg" "Zinc_mg" "Copper_mg" "Manganese_mg"
## [21] "Selenium_g" "Vit_C_mg" "Thiamin_mg" "Riboflavin_mg"
## [25] "Niacin_mg" "Vit_B6_mg" "Folate_Tot_g" "Folic_Acid_g"
## [29] "Food_Folate_g" "Choline_Tot__mg" "Vit_B12_g" "Lycopene_g"
## [33] "Vit_E_mg" "Vit_D_g" "Vit_K_g" "FA_Sat_g"
## [37] "FA_Mono_g" "FA_Poly_g" "Cholestrl_mg" "GmWt_1"
## [41] "GmWt_Desc1" "GmWt_2" "GmWt_Desc2"
dim(nutrition)
## [1] 8789 43
# ----------------------------------------
# Randomly sample 1000 rows from nutrition
# ----------------------------------------
nutrition <- dplyr::sample_n(nutrition, 1000)
# Part 2 - EDA and descriptive Statistics
# ----------------------------------------
# Keeping only a handful of columns for illustration
# ----------------------------------------
nutrition2 <- nutrition |> select(Energ_Kcal, Lipid_Tot_g,Carbohydrt_g,Fiber_TD_g,Calcium_mg,Iron_mg,Magnesium_mg,Phosphorus_mg,Potassium_mg, Copper_mg, Manganese_mg, Selenium_g, Vit_C_mg, Thiamin_mg, Riboflavin_mg)
dim(nutrition2)
## [1] 1000 15
# Create histogram
hist(nutrition2$Energ_Kcal)
# Plot QQ plot
qqnorm(nutrition2$Energ_Kcal)
qqline(nutrition2$Energ_Kcal)
# ----------------------------------------
# Convert calories to square root
# ----------------------------------------
nutrition2$Energ_Kcal_sqrt <- sqrt(nutrition2$Energ_Kcal)
# Create histogram
hist(nutrition2$Energ_Kcal_sqrt)
# Plot QQ plot
qqnorm(nutrition2$Energ_Kcal_sqrt)
qqline(nutrition2$Energ_Kcal_sqrt)
# Additional EDA
plot_missing(nutrition2)
plot_histogram(nutrition2)
plot_boxplot(nutrition2, by="Energ_Kcal_sqrt")
## Warning: Removed 819 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 199 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
plot_density(nutrition2)
# Descriptive Stats
summary(nutrition2)
## Energ_Kcal Lipid_Tot_g Carbohydrt_g Fiber_TD_g
## Min. : 0.00 Min. : 0.0000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 85.75 1st Qu.: 0.8175 1st Qu.: 0.000 1st Qu.: 0.000
## Median :200.00 Median : 5.0000 Median : 7.585 Median : 0.400
## Mean :228.83 Mean : 11.1410 Mean : 20.442 Mean : 2.178
## 3rd Qu.:341.25 3rd Qu.: 14.6500 3rd Qu.: 29.317 3rd Qu.: 2.400
## Max. :902.00 Max. :100.0000 Max. :100.000 Max. :42.800
## NA's :71
## Calcium_mg Iron_mg Magnesium_mg Phosphorus_mg
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.0
## 1st Qu.: 9.00 1st Qu.: 0.47 1st Qu.: 12.00 1st Qu.: 49.0
## Median : 21.00 Median : 1.33 Median : 21.00 Median : 160.0
## Mean : 78.27 Mean : 2.84 Mean : 38.84 Mean : 172.9
## 3rd Qu.: 70.00 3rd Qu.: 2.60 3rd Qu.: 31.00 3rd Qu.: 229.0
## Max. :2240.00 Max. :89.80 Max. :781.00 Max. :1677.0
## NA's :34 NA's :14 NA's :74 NA's :61
## Potassium_mg Copper_mg Manganese_mg Selenium_g
## Min. : 0.0 Min. : 0.0000 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 131.0 1st Qu.: 0.0500 1st Qu.: 0.0140 1st Qu.: 1.40
## Median : 236.0 Median : 0.0880 Median : 0.0620 Median : 10.60
## Mean : 294.4 Mean : 0.2127 Mean : 0.8724 Mean : 16.15
## 3rd Qu.: 342.5 3rd Qu.: 0.1590 3rd Qu.: 0.3130 3rd Qu.: 25.10
## Max. :6040.0 Max. :11.8650 Max. :133.0000 Max. :311.50
## NA's :41 NA's :128 NA's :225 NA's :171
## Vit_C_mg Thiamin_mg Riboflavin_mg Energ_Kcal_sqrt
## Min. : 0.0 Min. : 0.0000 Min. :0.0000 Min. : 0.00
## 1st Qu.: 0.0 1st Qu.: 0.0370 1st Qu.:0.0740 1st Qu.: 9.26
## Median : 0.0 Median : 0.0800 Median :0.1750 Median :14.14
## Mean : 10.7 Mean : 0.2293 Mean :0.2667 Mean :13.94
## 3rd Qu.: 2.8 3rd Qu.: 0.2362 3rd Qu.:0.2800 3rd Qu.:18.47
## Max. :2400.0 Max. :11.2000 Max. :6.8000 Max. :30.03
## NA's :80 NA's :60 NA's :59
mystats <- function(x, na.omit=FALSE){
if (na.omit)
x <- x[!is.na(x)]
m <- mean(x)
n <- length(x)
s <- sd(x)
min <- min(x)
max <- max(x)
return(c(n=n, mean=m, stdev=s,
min=min, max=max))}
dstats <- function(x)sapply(x, mystats, na.omit=TRUE)
myvars <- c(colnames(nutrition2)) # only numeric or integer columns
Descriptive_stats <- dstats(nutrition2[myvars])
round(Descriptive_stats,3)
## Energ_Kcal Lipid_Tot_g Carbohydrt_g Fiber_TD_g Calcium_mg Iron_mg
## n 1000.000 1000.000 1000.000 929.000 966.000 986.000
## mean 228.832 11.141 20.442 2.178 78.272 2.840
## stdev 175.006 17.013 26.637 4.602 176.553 6.689
## min 0.000 0.000 0.000 0.000 0.000 0.000
## max 902.000 100.000 100.000 42.800 2240.000 89.800
## Magnesium_mg Phosphorus_mg Potassium_mg Copper_mg Manganese_mg Selenium_g
## n 926.000 939.000 959.000 872.000 775.000 829.000
## mean 38.843 172.896 294.436 0.213 0.872 16.149
## stdev 69.341 169.000 389.099 0.584 7.033 21.265
## min 0.000 0.000 0.000 0.000 0.000 0.000
## max 781.000 1677.000 6040.000 11.865 133.000 311.500
## Vit_C_mg Thiamin_mg Riboflavin_mg Energ_Kcal_sqrt
## n 920.000 940.000 941.000 1000.000
## mean 10.698 0.229 0.267 13.940
## stdev 101.978 0.607 0.445 5.878
## min 0.000 0.000 0.000 0.000
## max 2400.000 11.200 6.800 30.033
# Part 3 - check for outliers and remove
# Function to remove outliers using z-score method
remove_outliers <- function(df, cols, threshold = 3) {
# Calculate z-scores for specified columns
z_scores <- scale(df[, cols])
# Identify rows with z-scores exceeding the threshold
outliers <- apply(abs(z_scores) > threshold, 1, any)
# Remove rows with outliers
df <- df[!outliers, ]
return(df)
}
# Define columns to check for outliers
cols_to_check <- c("Energ_Kcal", "Lipid_Tot_g", "Carbohydrt_g", "Fiber_TD_g", "Calcium_mg", "Iron_mg", "Magnesium_mg", "Phosphorus_mg",
"Potassium_mg", "Copper_mg", "Manganese_mg", "Selenium_g",
"Vit_C_mg", "Thiamin_mg", "Riboflavin_mg", "Energ_Kcal_sqrt")
# Remove outliers using z-score method
nutrition2 <- remove_outliers(nutrition2, cols_to_check)
# Check the dimensions of the new dataset
dim(nutrition2)
## [1] 880 16
plot_histogram(nutrition2)
# Re-Check
# Create histogram
hist(nutrition2$Energ_Kcal_sqrt)
# Plot QQ plot
qqnorm(nutrition2$Energ_Kcal_sqrt)
qqline(nutrition2$Energ_Kcal_sqrt)
rownames(nutrition2) <- NULL
Below I generated a correlation matrix to compare the selected independent variables to our response variable of ‘Energ_Kcal_sqrt’ or total calories for a food. This correlation matrix comes with a color coded legend from -1 to 1. Where blue represents a positive correlation and red a negative correlation to our response variable. The magnitude of the circles size indicates how positively or negatively correlated the given independent variable is with the aligning variable in the matrix. Since we are primarily concerned with our dependent variable, calories, we can see Lipids and Carbohydrates are the most highly correlated with calories.
I then went on to generate a scatter plot matrix removing some of the less correlated variables. I removed fiber, calcium, vitamin C, and potassium. I then went on to fit the regression model with these less correlated variables removed.
# 4 correlation matrix
cors <- cor(nutrition2, use="pairwise")
corrplot(cors, type='upper', col = brewer.pal(n=8, name="RdYlBu"))
# 5 make a scatter plot
most_corr <- nutrition2 |> select(Energ_Kcal_sqrt, Lipid_Tot_g,Carbohydrt_g,Iron_mg,Magnesium_mg,Phosphorus_mg, Copper_mg, Manganese_mg, Selenium_g, Thiamin_mg, Riboflavin_mg)
suppressWarnings({
scatterplotMatrix(most_corr, spread = FALSE, smoother.args = list(lty=2), main="Scatterplot Matrix")
})
After the regression model was fitted to exclude the less correlated variables as stated above I produced a summary of the model. Observing the p-values produced for each variable I could see there were several that weren’t statistically significant. These included magnesium, thiamin, and riboflavin. I opted not to remove them since when performing feature selection it will automatically account for this variables and will exclude them as needed. This model has an adjusted R-squared value of .914 or 91.4%. Adjusted R-squared tells us that 91.4% of the variance in the dependent variable, calories, can be explained by the independent variables in the model. It also tells us how well the data fits the model. In this case it appears the data is fitting the model well. I then looked at the models AIC which was at 2051.036 and BIC at 2103.251. This number gives an indication of the trade off between the models complexity and it’s goodness of fit. The lower both these numbers are the better the model is in comparison. For the coefficients summarized in our model we can see most are slightly positive in the range of zero to 0.35. These represent the slopes for each predictor variable in relation to the response variable. For copper, magnesium, and thiamin, we can see each coefficient is negative indicating a negative relationship between this predictor variable and our response variable calories. The highest of these being manganese at approximately -0.8. This variables can still be of value to the model even though they are negative this can aid in our models ability to predict calories based on the presenses of these nutrients.
# Part 5 continued and Part 6
# ----------------------------------------
# Fit initial model
# ----------------------------------------
model <- lm(Energ_Kcal_sqrt ~ Lipid_Tot_g+Carbohydrt_g+Iron_mg+Magnesium_mg+Phosphorus_mg+Copper_mg+ Manganese_mg+ Selenium_g+ Thiamin_mg+ Riboflavin_mg, data = nutrition2)
summary(model)
##
## Call:
## lm(formula = Energ_Kcal_sqrt ~ Lipid_Tot_g + Carbohydrt_g + Iron_mg +
## Magnesium_mg + Phosphorus_mg + Copper_mg + Manganese_mg +
## Selenium_g + Thiamin_mg + Riboflavin_mg, data = nutrition2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.0316 -0.9149 0.2384 0.8884 9.7002
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.092956 0.137672 44.257 < 2e-16 ***
## Lipid_Tot_g 0.322802 0.007585 42.561 < 2e-16 ***
## Carbohydrt_g 0.125374 0.003586 34.958 < 2e-16 ***
## Iron_mg 0.046241 0.053246 0.868 0.385503
## Magnesium_mg -0.013703 0.003638 -3.766 0.000182 ***
## Phosphorus_mg 0.009138 0.001018 8.974 < 2e-16 ***
## Copper_mg -1.633852 0.498853 -3.275 0.001117 **
## Manganese_mg 0.020683 0.077021 0.269 0.788380
## Selenium_g 0.066210 0.006132 10.797 < 2e-16 ***
## Thiamin_mg -1.305326 0.453525 -2.878 0.004144 **
## Riboflavin_mg 2.275717 0.615804 3.696 0.000240 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.66 on 594 degrees of freedom
## (275 observations deleted due to missingness)
## Multiple R-squared: 0.8946, Adjusted R-squared: 0.8929
## F-statistic: 504.3 on 10 and 594 DF, p-value: < 2.2e-16
summary(model)$adj.r.squared
## [1] 0.8928598
AIC(model)
## [1] 2342.703
BIC(model)
## [1] 2395.565
Below I generated several regression diagnostic plots to help interpret the current model. The first plot produced was a residual vs fitted plot. In this case it is preferable for the data to be spread out randomly making an even line across the data. The line produced follows more curvature than a flat line. This suggests the relationship between the predictor variable and the response is not adequately captured by this current model. For the Q-Q Residuals plot it can seen a decent portion of the data falls on the qq-line but on both tails a significant amount of the data is off the line. This shows that the normality isn’t consistent across the model. Potentially more transformations are needed for predictor variables so more follow a normal distribution. For the scale-location plot it checks for homoscadacity which should also align with a line that goes across evenly through the data. The data is fairly even but there is a slight dip in the line across fitted values by standard residuals. Residuals vs Leverage plot also known as the Cook’s distance plot. This is used to identify influential observations. We will address this later on in the analysis for these points.
## null device
## 1
## [1] 157 362
##
## Suggested power transformation: 1.342868
Next, I checked for multicolinearity. This was done through checking VIF for each independent variable. VIF measures how much the variance of the estimated regression coefficients is inflated due to multicollinearity among predictor variables. For variables that are greater than 2.5 were in danger of multicolinearity. In this model the VIF for magnesium was 2.66 and phosphorus was 2.94. These are two variables that can be removed in the process of correcting the model. These numbers represent the severity of of multicollinearity in the regression analysis.
# Part 8
# ----------------------------------------
# Calculate VIF
# ----------------------------------------
vif_values <- vif(model)
vif_values # remove those greater than 2.5 i.e. magnesium and phosphorus
## Lipid_Tot_g Carbohydrt_g Iron_mg Magnesium_mg Phosphorus_mg
## 1.266290 1.485668 1.762100 2.182890 2.545208
## Copper_mg Manganese_mg Selenium_g Thiamin_mg Riboflavin_mg
## 1.767360 1.342521 1.854357 1.616390 2.006117
Following our analysis above we can start applying what we learned to improve the model. First we look at Cook’s distance. This is a tool used to identify and eliminate unusual values present. This can help remove higher leveraged points that impact correlation of the data and how it impacts the model. I then proceeded to remove the identified points from Cook’s distance. This was applied to a new data frame. I then went on to create a second model to attempt to improve it from the original. I removed magnesium and phosphorus the two variables identified that could be susceptible to multicollinearity and applied the new data from removing unusual variables from Cook’s distance.
I then summarized the new model and obtained a new adjusted R-squared of 0.9016 or 90.16%. Then looking at the AIC and BIC again I obtained values of 1879.346 and 1925.968. Both of these values reduced from the previous model. This indicates an improvement in the goodness of fit of the model.
Lastly, with my new model I performed feature selection to determine which variables would and wouldn’t improve the model. These are made up of three types of processes backward, forward, and both feature selection. When testing all these forms the one that performed best was step wise feature selection using both processes combined. It returned an AIC value of 420.41. This included variables copper, thiamin, manganese, riboflavin, iron, vitamin C, selenium, carbohydrates, and lipids. It appears the model did improve based on BIC and AIC compared to the original model. This is likely attributed to the removal of VIF over 2.5.# Part 9
# ----------------------------------------
# Calculate Cook's distance
# This will be used to identify and eliminate unusual values
# ----------------------------------------
cooks_distance <- cooks.distance(model)
# ----------------------------------------
# Remove rows based on Cook's distance
# The rule of thumb of 4/n is used below
# ----------------------------------------
nutrition_filtered <- nutrition2[cooks_distance < 4/nrow(nutrition2), ]
dim(nutrition_filtered)
## [1] 816 16
# ----------------------------------------
# Remove missing values from the remaining data
# ----------------------------------------
nutrition_filtered <- na.omit(nutrition_filtered)
dim(nutrition_filtered)
## [1] 561 16
# ----------------------------------------
# Fit model on reduced data
# ----------------------------------------
# From VIF we want to remove Phosphorus and Magnesium > 2.5. stepAIC will remove insignificant values with a p-value greater than .05.
model2 <- lm(Energ_Kcal_sqrt ~ Lipid_Tot_g+Carbohydrt_g+Iron_mg+Copper_mg+Manganese_mg+Selenium_g+ Vit_C_mg+Thiamin_mg+Riboflavin_mg, data = nutrition_filtered)
summary(model2)
##
## Call:
## lm(formula = Energ_Kcal_sqrt ~ Lipid_Tot_g + Carbohydrt_g + Iron_mg +
## Copper_mg + Manganese_mg + Selenium_g + Vit_C_mg + Thiamin_mg +
## Riboflavin_mg, data = nutrition_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.7354 -0.9681 0.0657 0.9772 9.0887
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.733933 0.153229 43.947 < 2e-16 ***
## Lipid_Tot_g 0.336055 0.008270 40.636 < 2e-16 ***
## Carbohydrt_g 0.117936 0.003693 31.931 < 2e-16 ***
## Iron_mg 0.054887 0.057599 0.953 0.34105
## Copper_mg -1.550748 0.475667 -3.260 0.00118 **
## Manganese_mg 0.033108 0.080469 0.411 0.68092
## Selenium_g 0.086959 0.005851 14.861 < 2e-16 ***
## Vit_C_mg -0.031017 0.005230 -5.931 5.32e-09 ***
## Thiamin_mg -0.993761 0.490143 -2.027 0.04309 *
## Riboflavin_mg 3.168965 0.653339 4.850 1.61e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.731 on 551 degrees of freedom
## Multiple R-squared: 0.8855, Adjusted R-squared: 0.8836
## F-statistic: 473.5 on 9 and 551 DF, p-value: < 2.2e-16
summary(model2)$adj.r.squared
## [1] 0.8836433
AIC(model2)
## [1] 2219.713
BIC(model2)
## [1] 2267.34
# ----------------------------------------
# Perform forward selection using stepAIC
# ----------------------------------------
stepAIC(model2, direction = "backward")
## Start: AIC=625.66
## Energ_Kcal_sqrt ~ Lipid_Tot_g + Carbohydrt_g + Iron_mg + Copper_mg +
## Manganese_mg + Selenium_g + Vit_C_mg + Thiamin_mg + Riboflavin_mg
##
## Df Sum of Sq RSS AIC
## - Manganese_mg 1 0.5 1651.8 623.84
## - Iron_mg 1 2.7 1654.1 624.59
## <none> 1651.3 625.66
## - Thiamin_mg 1 12.3 1663.6 627.83
## - Copper_mg 1 31.9 1683.2 634.38
## - Riboflavin_mg 1 70.5 1721.8 647.12
## - Vit_C_mg 1 105.4 1756.8 658.38
## - Selenium_g 1 661.9 2313.2 812.76
## - Carbohydrt_g 1 3055.7 4707.0 1211.30
## - Lipid_Tot_g 1 4948.8 6600.1 1400.93
##
## Step: AIC=623.84
## Energ_Kcal_sqrt ~ Lipid_Tot_g + Carbohydrt_g + Iron_mg + Copper_mg +
## Selenium_g + Vit_C_mg + Thiamin_mg + Riboflavin_mg
##
## Df Sum of Sq RSS AIC
## - Iron_mg 1 3.0 1654.9 622.87
## <none> 1651.8 623.84
## - Thiamin_mg 1 12.0 1663.8 625.89
## - Copper_mg 1 31.8 1683.7 632.54
## - Riboflavin_mg 1 70.1 1721.9 645.14
## - Vit_C_mg 1 106.1 1758.0 656.78
## - Selenium_g 1 662.7 2314.5 811.07
## - Carbohydrt_g 1 3177.6 4829.4 1223.70
## - Lipid_Tot_g 1 4981.8 6633.7 1401.78
##
## Step: AIC=622.87
## Energ_Kcal_sqrt ~ Lipid_Tot_g + Carbohydrt_g + Copper_mg + Selenium_g +
## Vit_C_mg + Thiamin_mg + Riboflavin_mg
##
## Df Sum of Sq RSS AIC
## <none> 1654.9 622.87
## - Thiamin_mg 1 12.3 1667.1 625.01
## - Copper_mg 1 29.4 1684.3 630.75
## - Vit_C_mg 1 105.0 1759.8 655.37
## - Riboflavin_mg 1 107.1 1761.9 656.03
## - Selenium_g 1 681.0 2335.9 814.22
## - Carbohydrt_g 1 3402.2 5057.1 1247.54
## - Lipid_Tot_g 1 5017.6 6672.5 1403.05
##
## Call:
## lm(formula = Energ_Kcal_sqrt ~ Lipid_Tot_g + Carbohydrt_g + Copper_mg +
## Selenium_g + Vit_C_mg + Thiamin_mg + Riboflavin_mg, data = nutrition_filtered)
##
## Coefficients:
## (Intercept) Lipid_Tot_g Carbohydrt_g Copper_mg Selenium_g
## 6.73451 0.33627 0.11905 -1.42311 0.08744
## Vit_C_mg Thiamin_mg Riboflavin_mg
## -0.03091 -0.98674 3.45828
# ----------------------------------------
# Perform "forward" selection using stepAIC
# ----------------------------------------
stepAIC(model2, direction = "forward")
## Start: AIC=625.66
## Energ_Kcal_sqrt ~ Lipid_Tot_g + Carbohydrt_g + Iron_mg + Copper_mg +
## Manganese_mg + Selenium_g + Vit_C_mg + Thiamin_mg + Riboflavin_mg
##
## Call:
## lm(formula = Energ_Kcal_sqrt ~ Lipid_Tot_g + Carbohydrt_g + Iron_mg +
## Copper_mg + Manganese_mg + Selenium_g + Vit_C_mg + Thiamin_mg +
## Riboflavin_mg, data = nutrition_filtered)
##
## Coefficients:
## (Intercept) Lipid_Tot_g Carbohydrt_g Iron_mg Copper_mg
## 6.73393 0.33605 0.11794 0.05489 -1.55075
## Manganese_mg Selenium_g Vit_C_mg Thiamin_mg Riboflavin_mg
## 0.03311 0.08696 -0.03102 -0.99376 3.16896
# ----------------------------------------
# Perform "both" selection using stepAIC
# ----------------------------------------
stepAIC(model2, direction = "both")
## Start: AIC=625.66
## Energ_Kcal_sqrt ~ Lipid_Tot_g + Carbohydrt_g + Iron_mg + Copper_mg +
## Manganese_mg + Selenium_g + Vit_C_mg + Thiamin_mg + Riboflavin_mg
##
## Df Sum of Sq RSS AIC
## - Manganese_mg 1 0.5 1651.8 623.84
## - Iron_mg 1 2.7 1654.1 624.59
## <none> 1651.3 625.66
## - Thiamin_mg 1 12.3 1663.6 627.83
## - Copper_mg 1 31.9 1683.2 634.38
## - Riboflavin_mg 1 70.5 1721.8 647.12
## - Vit_C_mg 1 105.4 1756.8 658.38
## - Selenium_g 1 661.9 2313.2 812.76
## - Carbohydrt_g 1 3055.7 4707.0 1211.30
## - Lipid_Tot_g 1 4948.8 6600.1 1400.93
##
## Step: AIC=623.84
## Energ_Kcal_sqrt ~ Lipid_Tot_g + Carbohydrt_g + Iron_mg + Copper_mg +
## Selenium_g + Vit_C_mg + Thiamin_mg + Riboflavin_mg
##
## Df Sum of Sq RSS AIC
## - Iron_mg 1 3.0 1654.9 622.87
## <none> 1651.8 623.84
## + Manganese_mg 1 0.5 1651.3 625.66
## - Thiamin_mg 1 12.0 1663.8 625.89
## - Copper_mg 1 31.8 1683.7 632.54
## - Riboflavin_mg 1 70.1 1721.9 645.14
## - Vit_C_mg 1 106.1 1758.0 656.78
## - Selenium_g 1 662.7 2314.5 811.07
## - Carbohydrt_g 1 3177.6 4829.4 1223.70
## - Lipid_Tot_g 1 4981.8 6633.7 1401.78
##
## Step: AIC=622.87
## Energ_Kcal_sqrt ~ Lipid_Tot_g + Carbohydrt_g + Copper_mg + Selenium_g +
## Vit_C_mg + Thiamin_mg + Riboflavin_mg
##
## Df Sum of Sq RSS AIC
## <none> 1654.9 622.87
## + Iron_mg 1 3.0 1651.8 623.84
## + Manganese_mg 1 0.8 1654.1 624.59
## - Thiamin_mg 1 12.3 1667.1 625.01
## - Copper_mg 1 29.4 1684.3 630.75
## - Vit_C_mg 1 105.0 1759.8 655.37
## - Riboflavin_mg 1 107.1 1761.9 656.03
## - Selenium_g 1 681.0 2335.9 814.22
## - Carbohydrt_g 1 3402.2 5057.1 1247.54
## - Lipid_Tot_g 1 5017.6 6672.5 1403.05
##
## Call:
## lm(formula = Energ_Kcal_sqrt ~ Lipid_Tot_g + Carbohydrt_g + Copper_mg +
## Selenium_g + Vit_C_mg + Thiamin_mg + Riboflavin_mg, data = nutrition_filtered)
##
## Coefficients:
## (Intercept) Lipid_Tot_g Carbohydrt_g Copper_mg Selenium_g
## 6.73451 0.33627 0.11905 -1.42311 0.08744
## Vit_C_mg Thiamin_mg Riboflavin_mg
## -0.03091 -0.98674 3.45828
Using best subset we tested all versions of model to see which fit the data best. We set it to four to get the four best models of that type. It performs it for there were 14 predictors in the model. We generated a plot to visualize which model would work well. It can be read that the black bars that go to the top include the variables that would improve the adjusted R-squared. I then created a Original vs Predicted scatter plot. This plot can be read similarly to our qq-plot for normality generated earlier. It shows the majority of the data points following the line with some points moving off on the end of that tails. Overall, the model fit fairly well but there is room for improvement in several areas to have a more accurate model for predicting our response variable, calories.
# Best Subset
library(leaps)
## Warning: package 'leaps' was built under R version 4.3.3
# Perform all subsets regression
leaps <- regsubsets(Energ_Kcal_sqrt ~ ., data = nutrition_filtered, nbest=4)
plot(leaps, scale = "adjr2")
# review results
summary(leaps)
## Subset selection object
## Call: regsubsets.formula(Energ_Kcal_sqrt ~ ., data = nutrition_filtered,
## nbest = 4)
## 15 Variables (and intercept)
## Forced in Forced out
## Energ_Kcal FALSE FALSE
## Lipid_Tot_g FALSE FALSE
## Carbohydrt_g FALSE FALSE
## Fiber_TD_g FALSE FALSE
## Calcium_mg FALSE FALSE
## Iron_mg FALSE FALSE
## Magnesium_mg FALSE FALSE
## Phosphorus_mg FALSE FALSE
## Potassium_mg FALSE FALSE
## Copper_mg FALSE FALSE
## Manganese_mg FALSE FALSE
## Selenium_g FALSE FALSE
## Vit_C_mg FALSE FALSE
## Thiamin_mg FALSE FALSE
## Riboflavin_mg FALSE FALSE
## 4 subsets of each size up to 8
## Selection Algorithm: exhaustive
## Energ_Kcal Lipid_Tot_g Carbohydrt_g Fiber_TD_g Calcium_mg Iron_mg
## 1 ( 1 ) "*" " " " " " " " " " "
## 1 ( 2 ) " " "*" " " " " " " " "
## 1 ( 3 ) " " " " "*" " " " " " "
## 1 ( 4 ) " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " "
## 2 ( 2 ) "*" " " " " " " " " " "
## 2 ( 3 ) "*" " " " " "*" " " " "
## 2 ( 4 ) "*" " " "*" " " " " " "
## 3 ( 1 ) "*" "*" "*" " " " " " "
## 3 ( 2 ) "*" " " " " "*" " " " "
## 3 ( 3 ) "*" " " " " " " " " " "
## 3 ( 4 ) "*" " " " " " " " " " "
## 4 ( 1 ) "*" "*" "*" " " " " " "
## 4 ( 2 ) "*" "*" "*" " " " " " "
## 4 ( 3 ) "*" "*" "*" " " " " " "
## 4 ( 4 ) "*" "*" "*" " " " " " "
## 5 ( 1 ) "*" "*" "*" " " " " " "
## 5 ( 2 ) "*" "*" "*" " " " " " "
## 5 ( 3 ) "*" "*" "*" " " " " " "
## 5 ( 4 ) "*" "*" "*" "*" " " " "
## 6 ( 1 ) "*" "*" "*" " " " " " "
## 6 ( 2 ) "*" "*" "*" " " " " " "
## 6 ( 3 ) "*" "*" "*" " " " " " "
## 6 ( 4 ) "*" "*" "*" " " " " " "
## 7 ( 1 ) "*" "*" "*" " " " " " "
## 7 ( 2 ) "*" "*" "*" " " " " " "
## 7 ( 3 ) "*" "*" "*" " " " " " "
## 7 ( 4 ) "*" "*" "*" "*" " " " "
## 8 ( 1 ) "*" "*" "*" " " " " " "
## 8 ( 2 ) "*" "*" "*" " " " " " "
## 8 ( 3 ) "*" "*" "*" " " " " " "
## 8 ( 4 ) "*" "*" "*" "*" " " " "
## Magnesium_mg Phosphorus_mg Potassium_mg Copper_mg Manganese_mg
## 1 ( 1 ) " " " " " " " " " "
## 1 ( 2 ) " " " " " " " " " "
## 1 ( 3 ) " " " " " " " " " "
## 1 ( 4 ) " " "*" " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 2 ( 2 ) " " "*" " " " " " "
## 2 ( 3 ) " " " " " " " " " "
## 2 ( 4 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 3 ( 2 ) " " " " " " " " " "
## 3 ( 3 ) " " " " " " "*" " "
## 3 ( 4 ) "*" "*" " " " " " "
## 4 ( 1 ) " " " " " " " " " "
## 4 ( 2 ) " " " " " " "*" " "
## 4 ( 3 ) " " " " " " " " " "
## 4 ( 4 ) " " "*" " " " " " "
## 5 ( 1 ) "*" "*" " " " " " "
## 5 ( 2 ) " " "*" " " "*" " "
## 5 ( 3 ) " " " " " " "*" " "
## 5 ( 4 ) " " "*" " " " " " "
## 6 ( 1 ) "*" "*" " " " " " "
## 6 ( 2 ) "*" "*" " " "*" " "
## 6 ( 3 ) "*" "*" " " " " " "
## 6 ( 4 ) " " "*" " " "*" " "
## 7 ( 1 ) "*" "*" " " "*" " "
## 7 ( 2 ) "*" "*" " " " " " "
## 7 ( 3 ) "*" "*" " " " " " "
## 7 ( 4 ) "*" "*" " " " " " "
## 8 ( 1 ) "*" "*" " " "*" " "
## 8 ( 2 ) "*" "*" " " "*" " "
## 8 ( 3 ) "*" "*" " " " " " "
## 8 ( 4 ) "*" "*" " " " " " "
## Selenium_g Vit_C_mg Thiamin_mg Riboflavin_mg
## 1 ( 1 ) " " " " " " " "
## 1 ( 2 ) " " " " " " " "
## 1 ( 3 ) " " " " " " " "
## 1 ( 4 ) " " " " " " " "
## 2 ( 1 ) "*" " " " " " "
## 2 ( 2 ) " " " " " " " "
## 2 ( 3 ) " " " " " " " "
## 2 ( 4 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 3 ( 2 ) "*" " " " " " "
## 3 ( 3 ) "*" " " " " " "
## 3 ( 4 ) " " " " " " " "
## 4 ( 1 ) "*" " " " " " "
## 4 ( 2 ) " " " " " " " "
## 4 ( 3 ) " " "*" " " " "
## 4 ( 4 ) " " " " " " " "
## 5 ( 1 ) " " " " " " " "
## 5 ( 2 ) " " " " " " " "
## 5 ( 3 ) "*" " " " " " "
## 5 ( 4 ) " " " " " " " "
## 6 ( 1 ) " " "*" " " " "
## 6 ( 2 ) " " " " " " " "
## 6 ( 3 ) "*" " " " " " "
## 6 ( 4 ) " " "*" " " " "
## 7 ( 1 ) " " "*" " " " "
## 7 ( 2 ) "*" "*" " " " "
## 7 ( 3 ) " " "*" " " "*"
## 7 ( 4 ) " " "*" " " " "
## 8 ( 1 ) "*" "*" " " " "
## 8 ( 2 ) " " "*" " " "*"
## 8 ( 3 ) "*" "*" " " "*"
## 8 ( 4 ) "*" "*" " " " "
# Get the summary of all subsets regression
summary_leaps <- summary(leaps)
# Get the best model based on adjusted R-squared
best_model_index <- which.max(summary_leaps$adjr2)
best_predictors <- names(summary_leaps$outmat[best_model_index, ])
# Fit the best model
best_model_fit <- lm(Energ_Kcal_sqrt ~ ., data = nutrition_filtered[, c("Energ_Kcal_sqrt", best_predictors)])
# Get predictions
predictions <- predict(best_model_fit)
# Create scatterplot of Original vs Predicted response
plot(nutrition_filtered$Energ_Kcal_sqrt, predictions, xlab = "Original Response", ylab = "Predicted Response", main = "Original vs Predicted Response")
abline(a = 0, b = 1, col = "red") # Add a line of equality
Kabacoff, R. I. (2015). R in Action (2nd ed.). Manning Publications.
Bluman, A. G. (2018). Elementary statistics: A step by step approach (10th ed.). McGraw Hill.