1 Import Nutrition Data

Data downloaded from Kaggle.

https://www.kaggle.com/datasets/utsavdey1410/food-nutrition-dataset

Clear the workspace and read the read.csv help files

# Start from an empty workspace: delete every object that ls() reports.
rm(list = ls())

# Open the documentation page for read.csv (equivalent to ?read.csv).
help("read.csv")

My original import commands. To run them, you have to change the file path so that it points to the folder where the data is stored on your computer.

# Positional form: the path is passed without naming the argument.
# Exercise: identify which read.csv argument the path is matched to.
FOOD.DATA.GROUP1 <- read.csv(
  "~/Dropbox/WCAS/Summer/Data Analysis/Summer 2024/Day 2/FINAL FOOD DATASET/FOOD-DATA-GROUP1.csv"
)

# Named form: spelling out `file =` is the better coding practice.
FOOD.DATA.GROUP1 <- read.csv(
  file = "~/Dropbox/WCAS/Summer/Data Analysis/Summer 2024/Day 2/FINAL FOOD DATASET/FOOD-DATA-GROUP1.csv"
)

# header = TRUE is the default, so stating it explicitly does not change
# anything; it can still be a good habit while you are new to R.
# NOTE(review): this loads FOOD-DATA-GROUP1.csv into FOOD.DATA.GROUP2 --
# presumably on purpose, to compare header settings on the same file; confirm.
FOOD.DATA.GROUP2 <- read.csv(
  file = "~/Dropbox/WCAS/Summer/Data Analysis/Summer 2024/Day 2/FINAL FOOD DATASET/FOOD-DATA-GROUP1.csv",
  header = TRUE
)

# Same file again with header = FALSE; this overwrites FOOD.DATA.GROUP2.
FOOD.DATA.GROUP2 <- read.csv(
  file = "~/Dropbox/WCAS/Summer/Data Analysis/Summer 2024/Day 2/FINAL FOOD DATASET/FOOD-DATA-GROUP1.csv",
  header = FALSE
)

This piece of code is better because you do not have to change the file path: you can run it without making any changes, as long as you maintain the original folder structure (do not delete the subfolder FINAL FOOD DATASET).

# Relative path, positional form.
# Exercise: identify which read.csv argument the path is matched to.
FOOD.DATA.GROUP1 <- read.csv(
  "FINAL FOOD DATASET/FOOD-DATA-GROUP1.csv"
)

# Relative path, named `file =` argument, read with header = FALSE.
FOOD.DATA.GROUP2 <- read.csv(
  file = "FINAL FOOD DATASET/FOOD-DATA-GROUP1.csv",
  header = FALSE
)

1.1 Subsetting data

We will create a healthy and unhealthy panel of food, based on Nutrition.Density values.

?remove
# Delete the FOOD.DATA.GROUP2 object from the workspace.
remove(FOOD.DATA.GROUP2)

# Assigning NULL to a column removes it from the data frame.
FOOD.DATA.GROUP1$X <- NULL

# Logical flag: TRUE when Nutrition.Density is above 100. The flag is NA
# wherever Nutrition.Density itself is missing.
FOOD.DATA.GROUP1$healthy <- FOOD.DATA.GROUP1$Nutrition.Density > 100

# Split into the two panels. Indexing rows with a logical vector that contains
# NA returns an all-NA row for each NA, so convert the flag to row positions
# with which(), which keeps only the TRUE entries. Foods with a missing
# Nutrition.Density therefore appear in neither panel. When there are no
# missing values the result is identical to plain logical indexing.
df_healthy_food <- FOOD.DATA.GROUP1[
  which(FOOD.DATA.GROUP1$healthy), , drop = FALSE
]
df_unhealthy_food <- FOOD.DATA.GROUP1[
  which(!FOOD.DATA.GROUP1$healthy), , drop = FALSE
]

1.2 Subsetting large data

You can play with the nrows argument to import only a small subset of the original data.

# Import only the first 10 data rows by setting nrows.
# TRUE is written out in full: T is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
# As with the other absolute-path imports, adjust the path to your machine.
FOOD.DATA.GROUP2 <- read.csv(
  file = "~/Dropbox/WCAS/Summer/Data Analysis/Summer 2024/Day 2/FINAL FOOD DATASET/FOOD-DATA-GROUP1.csv",
  header = TRUE,
  nrows = 10
)

2 Summary Stats

We will use the psych package. Use the stargazer package if you wish instead.

  • Make sure the package is installed and the package is loaded.
# One-time setup: uncomment the next line to install psych.
# install.packages("psych")
# Attach psych for this session.
library("psych")
  • Now you can use the package.
# Summary statistics for every column of the healthy panel.
# NOTE(review): the min/max warnings in the output appear to come from the
# logical `healthy` column (its row shows NaN/Inf) -- confirm.
psych::describe(x = df_healthy_food)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
##                      vars   n   mean     sd median trimmed    mad    min    max
## Unnamed..0              1 246 248.69 160.28 246.00  244.26 199.41   1.00  550.0
## food*                   2 246 123.50  71.16 123.50  123.50  91.18   1.00  246.0
## Caloric.Value           3 246 373.58 211.94 347.50  348.85 172.72  71.00 1578.0
## Fat                     4 246  18.22  15.04  14.60   16.04  12.45   0.00   87.5
## Saturated.Fats          5 246   6.36   6.46   4.75    5.30   4.67   0.00   43.5
## Monounsaturated.Fats    6 246   6.20   5.99   4.55    5.29   4.45   0.00   34.2
## Polyunsaturated.Fats    7 246   3.51   4.64   1.95    2.68   2.00   0.00   40.1
## Carbohydrates           8 246  24.36  24.56  22.70   21.47  32.02   0.00  128.3
## Sugars                  9 246   4.41  10.29   0.02    2.03   0.03   0.00   70.8
## Protein                10 246  27.56  21.77  20.65   24.18  16.01   2.00   86.9
## Dietary.Fiber          11 246   1.56   2.67   0.00    0.97   0.00   0.00   17.5
## Cholesterol            12 246  92.86  81.35  70.40   83.32  71.68   0.00  352.5
## Sodium                 13 246   0.78   0.63   0.70    0.72   0.59   0.00    6.1
## Water                  14 246 126.26  83.34 108.60  119.40  82.43   1.10  489.3
## Vitamin.A              15 246   0.10   0.22   0.05    0.06   0.07   0.00    2.1
## Vitamin.B1             16 246   0.25   0.25   0.20    0.21   0.18   0.00    1.9
## Vitamin.B11            17 246   0.07   0.11   0.06    0.06   0.05   0.00    1.3
## Vitamin.B12            18 246   0.04   0.04   0.04    0.04   0.04   0.00    0.4
## Vitamin.B2             19 246   0.32   0.35   0.30    0.28   0.15   0.00    3.8
## Vitamin.B3             20 246   5.85   7.83   3.30    4.12   3.71   0.00   57.8
## Vitamin.B5             21 246   1.32   2.56   0.80    0.92   0.74   0.00   31.4
## Vitamin.B6             22 246   0.40   0.57   0.20    0.28   0.22   0.00    4.3
## Vitamin.C              23 246   3.37   6.62   0.40    1.77   0.59   0.00   42.9
## Vitamin.D              24 246   0.18   1.93   0.00    0.01   0.00   0.00   29.3
## Vitamin.E              25 246   0.90   1.67   0.20    0.53   0.30   0.00   14.1
## Vitamin.K              26 246   0.06   0.28   0.01    0.02   0.01   0.00    3.1
## Calcium                27 246 189.19 220.75 115.20  136.22  87.10   0.00 1283.5
## Copper                 28 246   5.36  48.74   0.20    0.19   0.15   0.00  668.6
## Iron                   29 246   2.41   2.20   2.10    2.14   1.48   0.01   21.1
## Magnesium              30 246  57.45  56.44  42.15   48.15  31.21   0.00  376.2
## Manganese              31 246   1.82  10.66   0.20    0.27   0.26   0.00  111.6
## Phosphorus             32 246 363.23 274.03 297.40  325.62 212.68   0.00 1385.1
## Potassium              33 246 516.38 454.11 373.70  449.24 310.83   0.00 3198.4
## Selenium               34 246  16.88 127.12   0.05    0.05   0.05   0.00 1351.6
## Zinc                   35 246   2.76   9.47   1.60    1.84   1.04   0.00  147.3
## Nutrition.Density      36 246 266.76 224.81 201.44  212.55 105.03 100.10 1337.0
## healthy                37 246    NaN     NA     NA     NaN     NA    Inf   -Inf
##                        range  skew kurtosis    se
## Unnamed..0            549.00  0.18    -1.22 10.22
## food*                 245.00  0.00    -1.21  4.54
## Caloric.Value        1507.00  1.83     6.25 13.51
## Fat                    87.50  1.57     3.08  0.96
## Saturated.Fats         43.50  2.40     8.63  0.41
## Monounsaturated.Fats   34.20  1.93     5.13  0.38
## Polyunsaturated.Fats   40.10  4.06    24.95  0.30
## Carbohydrates         128.30  1.05     1.55  1.57
## Sugars                 70.80  4.03    17.86  0.66
## Protein                84.90  1.21     0.54  1.39
## Dietary.Fiber          17.50  2.58     8.66  0.17
## Cholesterol           352.50  0.93    -0.04  5.19
## Sodium                  6.10  2.78    18.96  0.04
## Water                 488.20  0.96     1.33  5.31
## Vitamin.A               2.10  6.07    44.77  0.01
## Vitamin.B1              1.90  2.78    12.83  0.02
## Vitamin.B11             1.30  7.12    66.91  0.01
## Vitamin.B12             0.40  3.05    23.41  0.00
## Vitamin.B2              3.80  5.98    50.89  0.02
## Vitamin.B3             57.80  2.75     9.74  0.50
## Vitamin.B5             31.40  8.04    82.46  0.16
## Vitamin.B6              4.30  2.99    11.99  0.04
## Vitamin.C              42.90  3.28    12.66  0.42
## Vitamin.D              29.30 14.18   209.05  0.12
## Vitamin.E              14.10  3.78    19.65  0.11
## Vitamin.K               3.10  9.00    85.51  0.02
## Calcium              1283.50  2.59     6.51 14.07
## Copper                668.60 11.67   145.88  3.11
## Iron                   21.09  3.68    23.60  0.14
## Magnesium             376.20  2.86    10.92  3.60
## Manganese             111.60  7.71    63.19  0.68
## Phosphorus           1385.10  1.30     1.55 17.47
## Potassium            3198.40  1.71     4.54 28.95
## Selenium             1351.60  8.33    73.33  8.10
## Zinc                  147.30 14.47   217.48  0.60
## Nutrition.Density    1236.90  2.49     5.92 14.33
## healthy                 -Inf    NA       NA    NA

COMMENT ON THE RESULTS….

The read.csv help file gives us some examples of how to use the command.

## Handling an unknown maximum number of fields with count.fields
## when fill = TRUE
test1 <- c(as.character(1:5), "6,7", "8,9,10")
tf <- tempfile()
writeLines(text = test1, con = tf)

## Default read: only one column comes back
read.csv(file = tf, fill = TRUE)

## Largest number of comma-separated fields found on any line of the file
ncol <- max(count.fields(file = tf, sep = ","))

## Re-read with one column name per field (V1, V2, ...)
read.csv(
  file = tf,
  header = FALSE,
  fill = TRUE,
  col.names = sprintf("V%d", seq_len(ncol))
)

## Delete the temporary file
unlink(tf)

## "Inline" data set supplied through the text= argument instead of a file.
## The string starts and ends with an empty line; these leading and trailing
## empty lines are auto-trimmed.

read.table(text = "
a b
1 2
3 4
", header = TRUE)