# Load/clean data
# Nutrient table; the models below refer to its columns by name.
data <- read.csv("real_stuff.csv")

# Columns that are excluded from every model below. Dropping by name keeps the
# remaining columns in their original order and, like the per-column
# `$<- NULL` form, silently ignores any name that is not present in the file.
unused.cols <- c(
  "Vitamin.A..IU",
  "Fatty.acids..total.monounsaturated",
  "Fatty.acids..total.polyunsaturated",
  "Fatty.acids..total.trans.monoenoic",
  "Fatty.acids..total.trans.polyenoic"
)
data <- data[setdiff(names(data), unused.cols)]

# Food-group lookup table.
id.group <- read.csv("id_group.csv")

# Join on an explicit key: without `by`, merge() joins on every column name
# the two tables happen to share, so an extra shared column in either file
# would silently change the result.
# NOTE(review): assumes `food.id` is the only intended join key -- confirm
# against the two CSV files.
merged <- merge(id.group, data, by = "food.id")
# Baseline without food-group information: fitted on `data` (the table before
# the merge), regressing magnesium on every other column except the `food.id`
# identifier.
fit.no.group <- lm(
  formula = Magnesium..Mg ~ . - food.id,
  data = data
)
summary(fit.no.group)
##
## Call:
## lm(formula = Magnesium..Mg ~ . - food.id, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -276.8 -16.4 -4.1 8.9 470.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.42e+01 4.90e+00 -4.94 9.6e-07 ***
## Protein 2.39e+00 2.06e-01 11.62 < 2e-16 ***
## Total.lipid..fat. 8.45e-01 1.20e-01 7.04 4.1e-12 ***
## Carbohydrate..by.difference 3.15e-01 9.93e-02 3.18 0.0016 **
## Sugars..total 2.04e-01 1.51e-01 1.36 0.1754
## Fiber..total.dietary 5.67e+00 3.75e-01 15.14 < 2e-16 ***
## Calcium..Ca 2.33e-02 8.50e-03 2.74 0.0063 **
## Iron..Fe -1.35e-01 2.57e-01 -0.53 0.5980
## Sodium..Na -9.25e-03 3.75e-03 -2.47 0.0138 *
## Vitamin.A..RAE -1.03e-03 9.23e-04 -1.12 0.2642
## Vitamin.C..total.ascorbic.acid 3.84e-01 6.48e-02 5.93 4.6e-09 ***
## Cholesterol -1.55e-02 9.57e-03 -1.62 0.1050
## Fatty.acids..total.trans -1.64e+00 5.35e-01 -3.06 0.0023 **
## Fatty.acids..total.saturated -7.24e-01 2.20e-01 -3.29 0.0011 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42.6 on 791 degrees of freedom
## (6733 observations deleted due to missingness)
## Multiple R-squared: 0.507, Adjusted R-squared: 0.499
## F-statistic: 62.5 on 13 and 791 DF, p-value: <2e-16
##
# Reduced baseline: the same model minus the four predictors whose p-values
# exceeded 0.05 in the full baseline summary (sugars, iron, vitamin A RAE,
# cholesterol).
fit.no.group2 <- lm(
  formula = Magnesium..Mg ~ . - food.id - Sugars..total - Iron..Fe -
    Vitamin.A..RAE - Cholesterol,
  data = data
)
summary(fit.no.group2)
##
## Call:
## lm(formula = Magnesium..Mg ~ . - food.id - Sugars..total - Iron..Fe -
## Vitamin.A..RAE - Cholesterol, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -274.8 -16.2 -3.6 9.4 472.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -24.73264 4.83921 -5.11 4.0e-07 ***
## Protein 2.30238 0.20319 11.33 < 2e-16 ***
## Total.lipid..fat. 0.84787 0.12003 7.06 3.5e-12 ***
## Carbohydrate..by.difference 0.39812 0.06345 6.27 5.8e-10 ***
## Fiber..total.dietary 5.52282 0.35172 15.70 < 2e-16 ***
## Calcium..Ca 0.02278 0.00825 2.76 0.0059 **
## Sodium..Na -0.00923 0.00370 -2.49 0.0128 *
## Vitamin.C..total.ascorbic.acid 0.36746 0.06087 6.04 2.4e-09 ***
## Fatty.acids..total.trans -1.67752 0.53500 -3.14 0.0018 **
## Fatty.acids..total.saturated -0.68258 0.21922 -3.11 0.0019 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42.7 on 795 degrees of freedom
## (6733 observations deleted due to missingness)
## Multiple R-squared: 0.502, Adjusted R-squared: 0.496
## F-statistic: 89 on 9 and 795 DF, p-value: <2e-16
##
# With food group data: fitted on `merged`, so the categorical `food.group`
# column enters the model alongside the nutrients. `X` and `food.id` are
# excluded from the predictors.
fit <- lm(
  formula = Magnesium..Mg ~ . - X - food.id,
  data = merged
)
summary(fit)
##
## Call:
## lm(formula = Magnesium..Mg ~ . - X - food.id, data = merged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -220.84 -12.79 -0.04 10.50 285.92
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 6.51e+00 1.50e+01 0.43
## food.groupBaked Products 1.14e+01 1.51e+01 0.75
## food.groupBeef Products -6.27e+01 1.57e+01 -4.01
## food.groupBeverages 2.42e+01 1.89e+01 1.28
## food.groupBreakfast Cereals 1.95e+01 1.42e+01 1.37
## food.groupCereal Grains and Pasta 1.05e+01 1.54e+01 0.68
## food.groupDairy and Egg Products -1.65e+01 2.86e+01 -0.58
## food.groupEthnic Foods -8.04e+01 3.81e+01 -2.11
## food.groupFast Foods -2.59e+01 1.51e+01 -1.72
## food.groupFats and Oils -2.04e+01 1.68e+01 -1.21
## food.groupFinfish and Shellfish Products -1.39e+01 3.74e+01 -0.37
## food.groupFruits and Fruit Juices -1.12e+01 2.85e+01 -0.39
## food.groupLamb, Veal, and Game Products -5.88e+01 2.35e+01 -2.50
## food.groupLegumes and Legume Products -3.31e+01 1.52e+01 -2.17
## food.groupMeals, Entrees, and Sidedishes -1.56e+01 3.73e+01 -0.42
## food.groupNut and Seed Products 1.60e+02 1.86e+01 8.61
## food.groupPork Products -5.57e+01 1.53e+01 -3.63
## food.groupPoultry Products -5.92e+01 1.60e+01 -3.70
## food.groupSausages and Luncheon Meats -4.17e+01 1.64e+01 -2.54
## food.groupSnacks 5.28e+01 1.45e+01 3.63
## food.groupSoups, Sauces, and Gravies 1.59e+00 1.62e+01 0.10
## food.groupSpices and Herbs 4.76e+01 1.87e+01 2.55
## food.groupSweets 1.85e+01 1.55e+01 1.19
## food.groupVegetables and Vegetable Products -2.44e+00 1.73e+01 -0.14
## Protein 3.21e+00 2.51e-01 12.80
## Total.lipid..fat. 3.72e-01 1.26e-01 2.96
## Carbohydrate..by.difference -4.29e-01 1.39e-01 -3.08
## Sugars..total 2.28e-01 1.75e-01 1.30
## Fiber..total.dietary 4.35e+00 3.38e-01 12.84
## Calcium..Ca 1.23e-02 7.30e-03 1.69
## Iron..Fe 4.39e-01 2.62e-01 1.68
## Sodium..Na -4.78e-03 3.67e-03 -1.30
## Vitamin.A..RAE -6.89e-04 8.69e-04 -0.79
## Vitamin.C..total.ascorbic.acid 2.70e-01 5.59e-02 4.84
## Cholesterol 2.18e-03 8.08e-03 0.27
## Fatty.acids..total.trans -5.60e-01 4.49e-01 -1.25
## Fatty.acids..total.saturated -2.96e-01 1.82e-01 -1.63
## Pr(>|t|)
## (Intercept) 0.66382
## food.groupBaked Products 0.45058
## food.groupBeef Products 6.7e-05 ***
## food.groupBeverages 0.19962
## food.groupBreakfast Cereals 0.16973
## food.groupCereal Grains and Pasta 0.49482
## food.groupDairy and Egg Products 0.56251
## food.groupEthnic Foods 0.03525 *
## food.groupFast Foods 0.08586 .
## food.groupFats and Oils 0.22646
## food.groupFinfish and Shellfish Products 0.70998
## food.groupFruits and Fruit Juices 0.69380
## food.groupLamb, Veal, and Game Products 0.01249 *
## food.groupLegumes and Legume Products 0.02998 *
## food.groupMeals, Entrees, and Sidedishes 0.67548
## food.groupNut and Seed Products < 2e-16 ***
## food.groupPork Products 0.00030 ***
## food.groupPoultry Products 0.00023 ***
## food.groupSausages and Luncheon Meats 0.01134 *
## food.groupSnacks 0.00030 ***
## food.groupSoups, Sauces, and Gravies 0.92147
## food.groupSpices and Herbs 0.01095 *
## food.groupSweets 0.23320
## food.groupVegetables and Vegetable Products 0.88820
## Protein < 2e-16 ***
## Total.lipid..fat. 0.00319 **
## Carbohydrate..by.difference 0.00215 **
## Sugars..total 0.19261
## Fiber..total.dietary < 2e-16 ***
## Calcium..Ca 0.09150 .
## Iron..Fe 0.09410 .
## Sodium..Na 0.19338
## Vitamin.A..RAE 0.42784
## Vitamin.C..total.ascorbic.acid 1.6e-06 ***
## Cholesterol 0.78695
## Fatty.acids..total.trans 0.21295
## Fatty.acids..total.saturated 0.10373
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.7 on 768 degrees of freedom
## (6733 observations deleted due to missingness)
## Multiple R-squared: 0.682, Adjusted R-squared: 0.667
## F-statistic: 45.7 on 36 and 768 DF, p-value: <2e-16
##
# Reduced food-group model: starting from the full `merged` fit, drop seven
# nutrient predictors whose p-values exceeded 0.05 there (sodium, sugars,
# iron, vitamin A RAE, cholesterol, trans fat, saturated fat).
# `Vitamin.A..RAE` used to be subtracted twice; removing a term a second time
# does nothing, so the repeat is dropped and the fitted model is unchanged.
fit2 <- lm(
  Magnesium..Mg ~ . - X - food.id - Sodium..Na - Sugars..total - Iron..Fe -
    Vitamin.A..RAE - Cholesterol - Fatty.acids..total.trans -
    Fatty.acids..total.saturated,
  data = merged
)
summary(fit2)
##
## Call:
## lm(formula = Magnesium..Mg ~ . - X - food.id - Sodium..Na - Sugars..total -
## Iron..Fe - Vitamin.A..RAE - Cholesterol - Fatty.acids..total.trans -
## Fatty.acids..total.saturated, data = merged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -224.59 -11.60 -2.16 10.89 288.45
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 4.20830 14.88253 0.28
## food.groupBaked Products 10.12727 15.08122 0.67
## food.groupBeef Products -58.93361 15.37465 -3.83
## food.groupBeverages 26.91634 18.79306 1.43
## food.groupBreakfast Cereals 24.81798 13.81308 1.80
## food.groupCereal Grains and Pasta 8.81720 15.22526 0.58
## food.groupDairy and Egg Products -13.44246 28.44947 -0.47
## food.groupEthnic Foods -76.10468 38.02391 -2.00
## food.groupFast Foods -25.78984 14.86540 -1.73
## food.groupFats and Oils -21.61226 16.48300 -1.31
## food.groupFinfish and Shellfish Products -13.96120 37.39451 -0.37
## food.groupFruits and Fruit Juices -7.64936 28.37866 -0.27
## food.groupLamb, Veal, and Game Products -61.79421 21.48741 -2.88
## food.groupLegumes and Legume Products -32.41212 15.09434 -2.15
## food.groupMeals, Entrees, and Sidedishes -16.86988 37.28283 -0.45
## food.groupNut and Seed Products 167.86470 18.20938 9.22
## food.groupPork Products -54.66799 15.03012 -3.64
## food.groupPoultry Products -56.31371 15.70695 -3.59
## food.groupSausages and Luncheon Meats -42.88334 15.87745 -2.70
## food.groupSnacks 51.90391 14.48184 3.58
## food.groupSoups, Sauces, and Gravies 0.95879 15.90691 0.06
## food.groupSpices and Herbs 50.40825 18.57745 2.71
## food.groupSweets 24.74113 14.28421 1.73
## food.groupVegetables and Vegetable Products -1.49833 17.27685 -0.09
## Protein 3.17839 0.24739 12.85
## Total.lipid..fat. 0.24549 0.10026 2.45
## Carbohydrate..by.difference -0.32918 0.11305 -2.91
## Fiber..total.dietary 4.25724 0.31860 13.36
## Calcium..Ca 0.01626 0.00695 2.34
## Vitamin.C..total.ascorbic.acid 0.30657 0.05128 5.98
## Pr(>|t|)
## (Intercept) 0.77743
## food.groupBaked Products 0.50209
## food.groupBeef Products 0.00014 ***
## food.groupBeverages 0.15248
## food.groupBreakfast Cereals 0.07277 .
## food.groupCereal Grains and Pasta 0.56268
## food.groupDairy and Egg Products 0.63670
## food.groupEthnic Foods 0.04569 *
## food.groupFast Foods 0.08316 .
## food.groupFats and Oils 0.19018
## food.groupFinfish and Shellfish Products 0.70899
## food.groupFruits and Fruit Juices 0.78758
## food.groupLamb, Veal, and Game Products 0.00414 **
## food.groupLegumes and Legume Products 0.03208 *
## food.groupMeals, Entrees, and Sidedishes 0.65105
## food.groupNut and Seed Products < 2e-16 ***
## food.groupPork Products 0.00029 ***
## food.groupPoultry Products 0.00036 ***
## food.groupSausages and Luncheon Meats 0.00707 **
## food.groupSnacks 0.00036 ***
## food.groupSoups, Sauces, and Gravies 0.95195
## food.groupSpices and Herbs 0.00681 **
## food.groupSweets 0.08366 .
## food.groupVegetables and Vegetable Products 0.93091
## Protein < 2e-16 ***
## Total.lipid..fat. 0.01456 *
## Carbohydrate..by.difference 0.00370 **
## Fiber..total.dietary < 2e-16 ***
## Calcium..Ca 0.01954 *
## Vitamin.C..total.ascorbic.acid 3.4e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.8 on 775 degrees of freedom
## (6733 observations deleted due to missingness)
## Multiple R-squared: 0.678, Adjusted R-squared: 0.666
## F-statistic: 56.3 on 29 and 775 DF, p-value: <2e-16
##
library(randomForest)
## randomForest 4.6-6
## Type rfNews() to see new features/changes/bug fixes.
# Random-forest classifier for "magnesium above 50" using the same reduced
# predictor set as `fit2`. Rows with missing values are dropped via
# `na.action = na.omit`.
#
# The seed is fixed so that the forest (and therefore the OOB error and
# confusion matrix) is reproducible from run to run; the seed value itself is
# arbitrary.
# NOTE(review): the transcript recorded below this block came from an unseeded
# run of the earlier call (which listed `Vitamin.A..RAE` twice), so its Call
# line and exact counts will differ slightly after a re-run.
set.seed(1)
rf <- randomForest(
  factor(Magnesium..Mg > 50) ~ . - X - food.id - Sodium..Na - Sugars..total -
    Iron..Fe - Vitamin.A..RAE - Cholesterol - Fatty.acids..total.trans -
    Fatty.acids..total.saturated,
  data = merged,
  na.action = na.omit
)
print(rf)
##
## Call:
## randomForest(formula = factor(Magnesium..Mg > 50) ~ . - X - food.id - Sodium..Na - Sugars..total - Iron..Fe - Vitamin.A..RAE - Cholesterol - Fatty.acids..total.trans - Fatty.acids..total.saturated - Vitamin.A..RAE, data = merged, na.action = na.omit)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4.84%
## Confusion matrix:
## FALSE TRUE class.error
## FALSE 631 14 0.02171
## TRUE 25 135 0.15625
# Plot the variable-importance measures of the fitted forest.
varImpPlot(x = rf)